Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/data_designer/config/sampler_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from enum import Enum
from typing import Literal

import pandas as pd
from pydantic import Field, field_validator, model_validator
from typing_extensions import Self, TypeAlias

from data_designer import lazy_imports
from data_designer.config.base import ConfigBase
from data_designer.config.utils.constants import (
AVAILABLE_LOCALES,
Expand Down Expand Up @@ -113,7 +113,7 @@ class DatetimeSamplerParams(ConfigBase):
@classmethod
def _validate_param_is_datetime(cls, value: str) -> str:
try:
pd.to_datetime(value)
lazy_imports.pd.to_datetime(value)
except ValueError:
raise ValueError(f"Invalid datetime format: {value}")
return value
Expand Down
106 changes: 106 additions & 0 deletions src/data_designer/engine/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,108 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Engine module with fully automatic lazy loading.

This module automatically discovers ALL engine modules and their public classes/functions,
providing a facade that lazily imports components only when accessed. This significantly
improves import performance while requiring ZERO maintenance - just add a module and it's
automatically exported.

Note: Private modules (starting with _) are excluded from auto-discovery.
"""

import ast
import importlib
from pathlib import Path


def _discover_all_engine_exports() -> dict[str, tuple[str, str]]:
    """Automatically discover all public classes/functions in the engine package.

    Scans the engine directory recursively for all Python files, parses them
    with AST (without importing), and builds a mapping of all public exports.

    Returns:
        Dictionary mapping public names to (module_path, attribute_name) tuples.
    """
    exports: dict[str, tuple[str, str]] = {}
    engine_dir = Path(__file__).parent

    # Find all Python files in engine directory recursively
    for py_file in engine_dir.rglob("*.py"):
        # Skip __init__.py files and private modules (starting with _).
        # Check every path component below the engine dir, not just the file
        # name, so modules inside private subpackages (e.g. _internal/foo.py)
        # are excluded too, as the module docstring promises.
        if any(part.startswith("_") for part in py_file.relative_to(engine_dir).parts):
            continue

        # Convert file path to module path
        # e.g., dataset_builders/column_wise_builder.py -> data_designer.engine.dataset_builders.column_wise_builder
        rel_path = py_file.relative_to(engine_dir.parent)
        module_parts = list(rel_path.parts[:-1]) + [rel_path.stem]
        module_path = ".".join(["data_designer"] + module_parts)

        try:
            # Parse the Python file with AST (doesn't import it - fast!)
            tree = ast.parse(py_file.read_text(encoding="utf-8"), filename=str(py_file))
        except (OSError, SyntaxError, ValueError):
            # Unreadable or unparsable file: skip it silently rather than
            # letting one bad module break discovery for the whole package.
            continue

        # Find all top-level public classes and functions
        for node in tree.body:
            if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
                name = node.name
                # Only export public items (no leading underscore); on a name
                # collision across modules, the first one discovered wins.
                if not name.startswith("_") and name not in exports:
                    exports[name] = (module_path, name)

    return exports


# Cache for lazy imports - built on first access (by __getattr__ or __dir__)
# and reused thereafter; maps public name -> (module_path, attribute_name).
_LAZY_IMPORTS_CACHE: dict[str, tuple[str, str]] | None = None


def __getattr__(name: str) -> object:
    """Lazily import engine components when accessed.

    On first access, automatically discovers all public classes/functions in the
    engine package. Subsequent accesses use the cached mapping for fast lookups.

    Args:
        name: The name of the attribute to import.

    Returns:
        The imported class, function, or object.

    Raises:
        AttributeError: If the attribute is not found in any engine module.
    """
    global _LAZY_IMPORTS_CACHE

    # First access: run discovery once and memoize the result.
    if _LAZY_IMPORTS_CACHE is None:
        _LAZY_IMPORTS_CACHE = _discover_all_engine_exports()

    try:
        module_path, attr_name = _LAZY_IMPORTS_CACHE[name]
    except KeyError:
        raise AttributeError(f"module 'data_designer.engine' has no attribute {name!r}") from None

    # Import the owning module on demand and hand back the requested attribute.
    return getattr(importlib.import_module(module_path), attr_name)


def __dir__() -> list[str]:
    """Return list of all available lazy imports for introspection."""
    global _LAZY_IMPORTS_CACHE

    # Make sure discovery has run so dir() reflects every available export.
    cache = _LAZY_IMPORTS_CACHE
    if cache is None:
        cache = _LAZY_IMPORTS_CACHE = _discover_all_engine_exports()
    return list(cache)
22 changes: 10 additions & 12 deletions src/data_designer/interface/data_designer.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import logging
from pathlib import Path

import pandas as pd

# Lazy-loaded third-party and engine components via facades
from data_designer import engine, lazy_imports
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.default_model_settings import (
Expand All @@ -29,10 +31,6 @@
PREDEFINED_PROVIDERS,
)
from data_designer.config.utils.info import InfoType, InterfaceInfo
from data_designer.engine.analysis.dataset_profiler import (
DataDesignerDatasetProfiler,
DatasetProfilerConfig,
)
from data_designer.engine.compiler import compile_data_designer_config
from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
from data_designer.engine.dataset_builders.column_wise_builder import ColumnWiseDatasetBuilder
Expand Down Expand Up @@ -221,7 +219,7 @@ def preview(

dropped_columns = raw_dataset.columns.difference(processed_dataset.columns)
if len(dropped_columns) > 0:
dataset_for_profiler = pd.concat([processed_dataset, raw_dataset[dropped_columns]], axis=1)
dataset_for_profiler = lazy_imports.pd.concat([processed_dataset, raw_dataset[dropped_columns]], axis=1)
else:
dataset_for_profiler = processed_dataset

Expand All @@ -233,7 +231,7 @@ def preview(

if builder.artifact_storage.processors_outputs_path.exists():
processor_artifacts = {
processor_config.name: pd.read_parquet(
processor_config.name: lazy_imports.pd.read_parquet(
builder.artifact_storage.processors_outputs_path / f"{processor_config.name}.parquet",
dtype_backend="pyarrow",
).to_dict(orient="records")
Expand Down Expand Up @@ -364,10 +362,10 @@ def _create_dataset_builder(
)

def _create_dataset_profiler(
self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
) -> DataDesignerDatasetProfiler:
return DataDesignerDatasetProfiler(
config=DatasetProfilerConfig(
self, config_builder: DataDesignerConfigBuilder, resource_provider: engine.ResourceProvider
) -> engine.DataDesignerDatasetProfiler:
return engine.DataDesignerDatasetProfiler(
config=engine.DatasetProfilerConfig(
column_configs=config_builder.get_column_configs(),
column_profiler_configs=config_builder.get_profilers(),
),
Expand Down
43 changes: 43 additions & 0 deletions src/data_designer/lazy_imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Lazy imports facade for heavy third-party dependencies.

This module provides a centralized facade that lazily imports heavy dependencies
(pandas, pyarrow, etc.) only when accessed, significantly improving import performance.

Usage:
from data_designer import lazy_imports

df = lazy_imports.pd.DataFrame(...)
schema = lazy_imports.pq.read_schema(...)
"""


def __getattr__(name: str) -> object:
"""Lazily import heavy third-party dependencies when accessed.

This allows fast imports of data_designer while deferring loading of heavy
libraries like pandas and pyarrow until they're actually needed.

Supported imports:
- pd: pandas module
- pq: pyarrow.parquet module
"""
if name == "pd":
import pandas as pd

return pd
elif name == "pq":
import pyarrow.parquet as pq

return pq

raise AttributeError(f"module 'data_designer.lazy_imports' has no attribute {name!r}")


# For type checking
def __dir__() -> list[str]:
"""Return list of available lazy imports."""
return ["pd", "pq"]
Loading