Skip to content

Commit 130b187

Browse files
authored
Merge pull request #40 from MIDRC/feature/pip_support
Feature/pip support
2 parents 7045a69 + d6c6309 commit 130b187

22 files changed

Lines changed: 291 additions & 190 deletions

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,4 @@ seaborn>=0.13.2
1313
ipywidgets>=8.1.5
1414
ipython>=8.28.0
1515
python-dateutil>=2.9.0.post0
16-
tabulate>=0.9.0
1716
scikit-learn>=1.6.1

setup.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,33 @@
2828
"PySide6>=6.6.0",
2929
"scipy>=1.12.0",
3030
"openpyxl>=3.1.0",
31-
"pytest>=8.1.0",
3231
"PyYAML>=6.0.0",
3332
"prince>=0.15.0",
34-
"dash>=2.18.2",
35-
"plotly>=5.24.1",
36-
"matplotlib>=3.9.0",
37-
"seaborn>=0.13.2",
38-
"ipywidgets>=8.1.5",
39-
"ipython>=8.28.0",
33+
"pydantic>=2.7.0",
4034
"python-dateutil>=2.9.0.post0",
41-
"tabulate>=0.9.0",
4235
"scikit-learn>=1.6.1",
4336
],
37+
optional_dependencies={
38+
"ipython": [
39+
"ipywidgets>=8.1.5",
40+
"ipython>=8.28.0",
41+
"matplotlib>=3.9.0",
42+
"seaborn>=0.13.2",
43+
"plotly>=5.24.1",
44+
],
45+
"dash": [
46+
"dash>=2.18.2",
47+
"dash-bootstrap-components>=1.4.1",
48+
"plotly>=5.24.1",
49+
],
50+
"dev": [
51+
"pytest>=8.1.0",
52+
"black>=24.9.1",
53+
"flake8>=6.1.0",
54+
"mypy>=1.3.0",
55+
],
56+
},
57+
4458
python_requires=">=3.10",
4559
entry_points={
4660
"console_scripts": [

src/Untitled.ipynb

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1561,7 +1561,7 @@
15611561
"import importlib\n",
15621562
"importlib.reload(ExcelLayout)\n",
15631563
"\n",
1564-
"midrc_data = ExcelLayout.DataSource('MIDRC')\n",
1564+
"midrc_data = ExcelLayout.DataSourceConfig('MIDRC')\n",
15651565
"print( midrc_data.sheets['Race'].columns.values() )\n",
15661566
"print( midrc_data.sheets['Race'].df.columns )\n",
15671567
"cols_to_use = midrc_data.sheets['Race'].df.columns.intersection(midrc_data.sheets['Race'].columns.values())\n",
@@ -1571,15 +1571,15 @@
15711571
"midrc_race_data = np.asarray(midrc_data.sheets['Race'].df[cols_to_use].iloc[-1].values,dtype=float)\n",
15721572
"print(midrc_race_data)\n",
15731573
"\n",
1574-
"cdc_data = ExcelLayout.DataSource('CDC')\n",
1574+
"cdc_data = ExcelLayout.DataSourceConfig('CDC')\n",
15751575
"#print( cdc_data.sheets['Race'].columns )\n",
15761576
"cols_to_use = cdc_data.sheets['Race'].df.columns.intersection(cdc_data.sheets['Race'].columns.values())\n",
15771577
"#Remove date column\n",
15781578
"cols_to_use = cols_to_use[1:]\n",
15791579
"cdc_race_data = np.asarray(cdc_data.sheets['Race'].df[cols_to_use].iloc[-1].values,dtype=float)\n",
15801580
"print(cdc_race_data)\n",
15811581
"\n",
1582-
"census_data = ExcelLayout.DataSource('Census')\n",
1582+
"census_data = ExcelLayout.DataSourceConfig('Census')\n",
15831583
"#print( census_data.sheets['Race'].columns.values() )\n",
15841584
"#print( census_data.sheets['Race'].df.columns )\n",
15851585
"cols_to_use = census_data.sheets['Race'].df.columns.intersection(census_data.sheets['Race'].columns.values())\n",
@@ -1607,17 +1607,17 @@
16071607
"\n",
16081608
"sheet_name = 'Race'\n",
16091609
"\n",
1610-
"midrc_data = ExcelLayout.DataSource('MIDRC')\n",
1610+
"midrc_data = ExcelLayout.DataSourceConfig('MIDRC')\n",
16111611
"cols_to_use = midrc_data.sheets[sheet_name].df.columns.intersection(midrc_data.sheets[sheet_name].columns.values())\n",
16121612
"cols_to_use = cols_to_use[1:]\n",
16131613
"midrc_sheet_data = np.asarray(midrc_data.sheets[sheet_name].df[cols_to_use].iloc[-1].values,dtype=float)\n",
16141614
"\n",
1615-
"cdc_data = ExcelLayout.DataSource('CDC')\n",
1615+
"cdc_data = ExcelLayout.DataSourceConfig('CDC')\n",
16161616
"cols_to_use = cdc_data.sheets[sheet_name].df.columns.intersection(cdc_data.sheets[sheet_name].columns.values())\n",
16171617
"cols_to_use = cols_to_use[1:]\n",
16181618
"cdc_sheet_data = np.asarray(cdc_data.sheets[sheet_name].df[cols_to_use].iloc[-1].values,dtype=float)\n",
16191619
"\n",
1620-
"census_data = ExcelLayout.DataSource('Census')\n",
1620+
"census_data = ExcelLayout.DataSourceConfig('Census')\n",
16211621
"cols_to_use = census_data.sheets[sheet_name].df.columns.intersection(census_data.sheets[sheet_name].columns.values())\n",
16221622
"cols_to_use = cols_to_use[1:]\n",
16231623
"census_sheet_data = np.asarray(census_data.sheets[sheet_name].df[cols_to_use].iloc[-1].values,dtype=float)\n",

src/midrc_react/core/aggregate_jsd_calc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def calc_jsd_by_features_combined(combined_df: pd.DataFrame, cols_to_use: list[s
7070
# Convert dataset columns to string in case they are integers
7171
pivot_table.columns = pivot_table.columns.astype(str)
7272

73-
labels = combined_df[dataset_column].unique().astype(str)
73+
labels = sorted(combined_df[dataset_column].unique().astype(str))
7474

7575
# Create a dictionary to hold counts for each dataset
7676
counts_dict = {dataset: pivot_table[dataset].values if dataset in pivot_table else np.zeros(len(pivot_table)) for

src/midrc_react/core/excel_layout.py

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -41,20 +41,20 @@ def __init__(self, data_source, custom_age_ranges=None):
4141
data_source (dict): The data source configuration.
4242
custom_age_ranges (dict, optional): A dictionary of custom age ranges.
4343
"""
44-
self.name = data_source['name']
44+
self.name = data_source.name
4545
self.sheets = {}
46-
self.datatype = data_source['data type']
47-
self.filename = data_source['filename']
46+
self.datatype = data_source.data_type
47+
self.filename = data_source.filename
4848
self.data_source = data_source
4949
self.custom_age_ranges = custom_age_ranges
50-
self._numeric_cols = data_source.get('numeric_cols', {}) # Extract numeric columns from config
51-
self._columns = data_source.get('columns', [])
50+
self._numeric_cols = data_source.numeric_cols # Extract numeric columns from config
51+
self._columns = data_source.columns
5252
self.raw_data = None
5353

5454
# Load preprocessing plugin if specified
5555
self.preprocessor = None
56-
if 'plugin' in data_source and data_source['plugin']:
57-
plugin_name = data_source['plugin']
56+
if data_source.plugin:
57+
plugin_name = data_source.plugin
5858
plugin_path = os.path.join("plugins", f"{plugin_name}.py")
5959
self.preprocessor = DataSource.load_plugin(plugin_path)
6060

@@ -64,8 +64,8 @@ def __init__(self, data_source, custom_age_ranges=None):
6464
self.build_data_frames_from_csv(self.filename)
6565
else:
6666
self.build_data_frames_from_file(self.filename)
67-
if self.datatype == 'content' and 'content' in data_source:
68-
self.build_data_frames_from_content(data_source['content'])
67+
if self.datatype == 'content' and hasattr(data_source, 'content') and data_source.content is not None:
68+
self.build_data_frames_from_content(data_source.content)
6969

7070
def raw_columns_to_use(self):
7171
"""
@@ -126,9 +126,9 @@ def apply_numeric_column_adjustments(self, df: pd.DataFrame):
126126
pd.DataFrame: The DataFrame with numeric column adjustments.
127127
"""
128128
for str_col, col_dict in self._numeric_cols.items():
129-
num_col = col_dict['raw column'] if 'raw column' in col_dict else str_col
130-
bins = col_dict['bins'] if 'bins' in col_dict else None
131-
labels = col_dict['labels'] if 'labels' in col_dict else None
129+
num_col = col_dict.raw_column if hasattr(col_dict, 'raw_column') else str_col
130+
bins = col_dict.bins if hasattr(col_dict, 'bins') else None
131+
labels = col_dict.labels if hasattr(col_dict, 'labels') else None
132132

133133
if num_col in df.columns:
134134
df = bin_dataframe_column(df, num_col, str_col, bins=bins, labels=labels)
@@ -139,7 +139,6 @@ def apply_numeric_column_adjustments(self, df: pd.DataFrame):
139139
# else:
140140
# # Default "N-N" format conversion
141141
# df[str_col] = df[num_col].apply(lambda x: f'{int(x)}-{int(x)}' if pd.notna(x) else x)
142-
143142
return df
144143

145144
def build_data_frames_from_csv(self, filename: str):
@@ -226,7 +225,7 @@ def create_sheets_from_df(self, df: pd.DataFrame):
226225
if col in df.columns:
227226
df_cumsum = self.calculate_cumulative_sums(df, col)
228227
if col in self._numeric_cols:
229-
labels = self._numeric_cols[col].get('labels', None)
228+
labels = self._numeric_cols[col].labels if hasattr(self._numeric_cols[col], 'labels') else None
230229
if labels:
231230
# The first column (e.g., date) remains at index 0.
232231
date_column = df_cumsum.columns[0]
@@ -333,25 +332,28 @@ def _process_date_column(self, data_source: dict):
333332
"""
334333

335334
# This assumes that the first column is either the date column or does not have useful data
336-
if data_source.get('date'):
335+
date_value = getattr(data_source, 'date', None)
336+
if date_value:
337337
self._df.drop(self._df.columns[0], axis=1, inplace=True)
338-
self._df.insert(0, 'date', data_source['date'], False)
338+
self._df.insert(0, 'date', date_value, False)
339339

340340
self._df['date'] = pd.to_datetime(self._df['date'], errors='coerce')
341341

342342
self._columns['date'] = self._df.columns[0]
343343

344-
def _process_columns(self, data_source: dict):
344+
def _process_columns(self, data_source):
345345
"""
346346
Process and rename columns according to the data source settings.
347347
348348
Args:
349-
data_source (dict): The data source object.
349+
data_source (DataSource): The data source object.
350350
"""
351351
for col in self._df.columns[1:]:
352352
col_name = col
353-
if 'remove column name text' in data_source:
354-
for txt in data_source['remove column name text']:
353+
# Access remove_column_name_text from pydantic model
354+
remove_text = getattr(data_source, 'remove_column_name_text', None)
355+
if remove_text:
356+
for txt in remove_text:
355357
col_name = col.split(txt)[0]
356358
col_name = col_name.rstrip()
357359
self._columns[col_name] = col

src/midrc_react/core/famd_calc.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import numpy as np
2323
import pandas as pd
2424
import prince
25-
from tabulate import tabulate
2625

2726
from midrc_react.core.data_preprocessing import combine_datasets_from_list
2827
from midrc_react.core.numeric_distances import calc_distances_via_df, scale_feature
@@ -132,7 +131,11 @@ def calc_famd_df(raw_df, cols_to_use, numeric_cols, dataset_column='_dataset_',
132131
if len(outlier_df) > 0:
133132
outlier_df = outlier_df.sort_values(by=famd_column, ascending=False)
134133
print(f"Outliers in FAMD fitting: {outlier_df.shape[0]}")
135-
print(tabulate(outlier_df, headers='keys', tablefmt='psql'))
134+
try:
135+
from tabulate import tabulate
136+
print(tabulate(outlier_df, headers='keys', tablefmt='psql'))
137+
except ImportError:
138+
print(outlier_df)
136139

137140
return c_df
138141

src/midrc_react/core/jsdconfig.py

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,64 @@
1717
This module contains the JSDConfig class, which loads and stores data from a YAML file.
1818
"""
1919

20-
from dataclasses import dataclass, field
2120
import os
21+
from typing import List, Optional, Dict, Union, Any
2222

23+
from pydantic import BaseModel, Field, ValidationError
24+
from pydantic.dataclasses import dataclass
2325
from yaml import load
2426
try:
2527
from yaml import CLoader as Loader
2628
except ImportError:
2729
from yaml import Loader
2830

2931

30-
@dataclass
32+
class NumericColumnConfig(BaseModel):
33+
"""
34+
NumericColumnConfig model to represent numeric column configurations in the YAML configuration.
35+
"""
36+
raw_column: str = Field(..., alias='raw column')
37+
bins: List[float]
38+
labels: Optional[List[str]] = None
39+
adjust_outliers: bool = Field(False, alias='adjust outliers')
40+
41+
class DataSourceConfig(BaseModel):
42+
"""
43+
DataSource model to represent individual data sources in the YAML configuration.
44+
"""
45+
name: str
46+
description: Optional[str] = None
47+
data_type: str = Field(..., alias='data type')
48+
filename: str
49+
columns: Optional[List[str]] = None
50+
numeric_cols: Optional[Dict[str, NumericColumnConfig]] = None
51+
plugin: Optional[str] = None
52+
date: Optional[str] = None
53+
remove_column_name_text: Optional[List[str]] = Field(None, alias='remove column name text')
54+
55+
content: Optional[Any] = None # Placeholder for loaded content
56+
content_type: Optional[str] = None # Placeholder for content type after loading
57+
58+
class Config:
59+
validate_by_name = True
60+
extra = 'allow'
61+
62+
DataSourceConfigList = List[DataSourceConfig]
63+
64+
class ConfigData(BaseModel):
65+
"""
66+
ConfigData model to represent the structure of the YAML configuration data.
67+
"""
68+
# Define fields based on expected YAML structure
69+
data_sources: DataSourceConfigList = Field(..., alias='data sources')
70+
custom_age_ranges: Optional[Dict[str, List[Union[int, float]]]] = Field(None, alias='custom_age_range')
71+
72+
class Config:
73+
validate_by_name = True
74+
# accept extra fields in the YAML
75+
extra = 'allow'
76+
77+
3178
class JSDConfig:
3279
"""
3380
The JSDConfig class loads and stores data from a YAML file.
@@ -38,13 +85,16 @@ class JSDConfig:
3885
3986
Methods:
4087
__init__(self, filename='jsdconfig.yaml'): Initializes a new instance of JSDConfig.
41-
__post_init__(self): Loads the YAML data from the current filename.
88+
_load_data(self): Loads the YAML data from the current filename.
89+
set_filename(self, new_filename): Sets a new filename and reloads the data.
4290
"""
43-
filename: str = 'jsdconfig.yaml'
44-
data: dict = field(init=False)
91+
filename: str
92+
data: Optional[ConfigData]
4593

46-
def __post_init__(self):
94+
def __init__(self, filename: str = 'jsdconfig.yaml'):
4795
"""Load the YAML data from the current filename."""
96+
self.filename = filename
97+
self.data = None
4898
# os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
4999
self._load_data()
50100

@@ -53,11 +103,16 @@ def _load_data(self):
53103
if not os.path.exists(self.filename):
54104
print(f"File {self.filename} does not exist. Skipping load.")
55105
print(f"Current working directory: {os.getcwd()}")
56-
self.data = {}
106+
self.data = None
57107
return
58108

59109
with open(self.filename, 'r', encoding='utf-8') as stream:
60-
self.data = load(stream, Loader=Loader)
110+
raw = load(stream, Loader=Loader)
111+
try:
112+
self.data = ConfigData(**raw)
113+
except ValidationError as e:
114+
self.data = None
115+
raise
61116
# print(dump(self.data))
62117

63118
def set_filename(self, new_filename: str):

0 commit comments

Comments (0)