Skip to content

Commit 130b187

Browse files
authored
Merge pull request #40 from MIDRC/feature/pip_support
Feature/pip support
2 parents 7045a69 + d6c6309 commit 130b187

22 files changed

Lines changed: 291 additions & 190 deletions

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,4 @@ seaborn>=0.13.2
1313
ipywidgets>=8.1.5
1414
ipython>=8.28.0
1515
python-dateutil>=2.9.0.post0
16-
tabulate>=0.9.0
1716
scikit-learn>=1.6.1

setup.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,33 @@
2828
"PySide6>=6.6.0",
2929
"scipy>=1.12.0",
3030
"openpyxl>=3.1.0",
31-
"pytest>=8.1.0",
3231
"PyYAML>=6.0.0",
3332
"prince>=0.15.0",
34-
"dash>=2.18.2",
35-
"plotly>=5.24.1",
36-
"matplotlib>=3.9.0",
37-
"seaborn>=0.13.2",
38-
"ipywidgets>=8.1.5",
39-
"ipython>=8.28.0",
33+
"pydantic>=2.7.0",
4034
"python-dateutil>=2.9.0.post0",
41-
"tabulate>=0.9.0",
4235
"scikit-learn>=1.6.1",
4336
],
37+
optional_dependencies={
38+
"ipython": [
39+
"ipywidgets>=8.1.5",
40+
"ipython>=8.28.0",
41+
"matplotlib>=3.9.0",
42+
"seaborn>=0.13.2",
43+
"plotly>=5.24.1",
44+
],
45+
"dash": [
46+
"dash>=2.18.2",
47+
"dash-bootstrap-components>=1.4.1",
48+
"plotly>=5.24.1",
49+
],
50+
"dev": [
51+
"pytest>=8.1.0",
52+
"black>=24.9.1",
53+
"flake8>=6.1.0",
54+
"mypy>=1.3.0",
55+
],
56+
},
57+
4458
python_requires=">=3.10",
4559
entry_points={
4660
"console_scripts": [

src/Untitled.ipynb

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1561,7 +1561,7 @@
15611561
"import importlib\n",
15621562
"importlib.reload(ExcelLayout)\n",
15631563
"\n",
1564-
"midrc_data = ExcelLayout.DataSource('MIDRC')\n",
1564+
"midrc_data = ExcelLayout.DataSourceConfig('MIDRC')\n",
15651565
"print( midrc_data.sheets['Race'].columns.values() )\n",
15661566
"print( midrc_data.sheets['Race'].df.columns )\n",
15671567
"cols_to_use = midrc_data.sheets['Race'].df.columns.intersection(midrc_data.sheets['Race'].columns.values())\n",
@@ -1571,15 +1571,15 @@
15711571
"midrc_race_data = np.asarray(midrc_data.sheets['Race'].df[cols_to_use].iloc[-1].values,dtype=float)\n",
15721572
"print(midrc_race_data)\n",
15731573
"\n",
1574-
"cdc_data = ExcelLayout.DataSource('CDC')\n",
1574+
"cdc_data = ExcelLayout.DataSourceConfig('CDC')\n",
15751575
"#print( cdc_data.sheets['Race'].columns )\n",
15761576
"cols_to_use = cdc_data.sheets['Race'].df.columns.intersection(cdc_data.sheets['Race'].columns.values())\n",
15771577
"#Remove date column\n",
15781578
"cols_to_use = cols_to_use[1:]\n",
15791579
"cdc_race_data = np.asarray(cdc_data.sheets['Race'].df[cols_to_use].iloc[-1].values,dtype=float)\n",
15801580
"print(cdc_race_data)\n",
15811581
"\n",
1582-
"census_data = ExcelLayout.DataSource('Census')\n",
1582+
"census_data = ExcelLayout.DataSourceConfig('Census')\n",
15831583
"#print( census_data.sheets['Race'].columns.values() )\n",
15841584
"#print( census_data.sheets['Race'].df.columns )\n",
15851585
"cols_to_use = census_data.sheets['Race'].df.columns.intersection(census_data.sheets['Race'].columns.values())\n",
@@ -1607,17 +1607,17 @@
16071607
"\n",
16081608
"sheet_name = 'Race'\n",
16091609
"\n",
1610-
"midrc_data = ExcelLayout.DataSource('MIDRC')\n",
1610+
"midrc_data = ExcelLayout.DataSourceConfig('MIDRC')\n",
16111611
"cols_to_use = midrc_data.sheets[sheet_name].df.columns.intersection(midrc_data.sheets[sheet_name].columns.values())\n",
16121612
"cols_to_use = cols_to_use[1:]\n",
16131613
"midrc_sheet_data = np.asarray(midrc_data.sheets[sheet_name].df[cols_to_use].iloc[-1].values,dtype=float)\n",
16141614
"\n",
1615-
"cdc_data = ExcelLayout.DataSource('CDC')\n",
1615+
"cdc_data = ExcelLayout.DataSourceConfig('CDC')\n",
16161616
"cols_to_use = cdc_data.sheets[sheet_name].df.columns.intersection(cdc_data.sheets[sheet_name].columns.values())\n",
16171617
"cols_to_use = cols_to_use[1:]\n",
16181618
"cdc_sheet_data = np.asarray(cdc_data.sheets[sheet_name].df[cols_to_use].iloc[-1].values,dtype=float)\n",
16191619
"\n",
1620-
"census_data = ExcelLayout.DataSource('Census')\n",
1620+
"census_data = ExcelLayout.DataSourceConfig('Census')\n",
16211621
"cols_to_use = census_data.sheets[sheet_name].df.columns.intersection(census_data.sheets[sheet_name].columns.values())\n",
16221622
"cols_to_use = cols_to_use[1:]\n",
16231623
"census_sheet_data = np.asarray(census_data.sheets[sheet_name].df[cols_to_use].iloc[-1].values,dtype=float)\n",

src/midrc_react/core/aggregate_jsd_calc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def calc_jsd_by_features_combined(combined_df: pd.DataFrame, cols_to_use: list[s
7070
# Convert dataset columns to string in case they are integers
7171
pivot_table.columns = pivot_table.columns.astype(str)
7272

73-
labels = combined_df[dataset_column].unique().astype(str)
73+
labels = sorted(combined_df[dataset_column].unique().astype(str))
7474

7575
# Create a dictionary to hold counts for each dataset
7676
counts_dict = {dataset: pivot_table[dataset].values if dataset in pivot_table else np.zeros(len(pivot_table)) for

src/midrc_react/core/excel_layout.py

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -41,20 +41,20 @@ def __init__(self, data_source, custom_age_ranges=None):
4141
data_source (dict): The data source configuration.
4242
custom_age_ranges (dict, optional): A dictionary of custom age ranges.
4343
"""
44-
self.name = data_source['name']
44+
self.name = data_source.name
4545
self.sheets = {}
46-
self.datatype = data_source['data type']
47-
self.filename = data_source['filename']
46+
self.datatype = data_source.data_type
47+
self.filename = data_source.filename
4848
self.data_source = data_source
4949
self.custom_age_ranges = custom_age_ranges
50-
self._numeric_cols = data_source.get('numeric_cols', {}) # Extract numeric columns from config
51-
self._columns = data_source.get('columns', [])
50+
self._numeric_cols = data_source.numeric_cols # Extract numeric columns from config
51+
self._columns = data_source.columns
5252
self.raw_data = None
5353

5454
# Load preprocessing plugin if specified
5555
self.preprocessor = None
56-
if 'plugin' in data_source and data_source['plugin']:
57-
plugin_name = data_source['plugin']
56+
if data_source.plugin:
57+
plugin_name = data_source.plugin
5858
plugin_path = os.path.join("plugins", f"{plugin_name}.py")
5959
self.preprocessor = DataSource.load_plugin(plugin_path)
6060

@@ -64,8 +64,8 @@ def __init__(self, data_source, custom_age_ranges=None):
6464
self.build_data_frames_from_csv(self.filename)
6565
else:
6666
self.build_data_frames_from_file(self.filename)
67-
if self.datatype == 'content' and 'content' in data_source:
68-
self.build_data_frames_from_content(data_source['content'])
67+
if self.datatype == 'content' and hasattr(data_source, 'content') and data_source.content is not None:
68+
self.build_data_frames_from_content(data_source.content)
6969

7070
def raw_columns_to_use(self):
7171
"""
@@ -126,9 +126,9 @@ def apply_numeric_column_adjustments(self, df: pd.DataFrame):
126126
pd.DataFrame: The DataFrame with numeric column adjustments.
127127
"""
128128
for str_col, col_dict in self._numeric_cols.items():
129-
num_col = col_dict['raw column'] if 'raw column' in col_dict else str_col
130-
bins = col_dict['bins'] if 'bins' in col_dict else None
131-
labels = col_dict['labels'] if 'labels' in col_dict else None
129+
num_col = col_dict.raw_column if hasattr(col_dict, 'raw_column') else str_col
130+
bins = col_dict.bins if hasattr(col_dict, 'bins') else None
131+
labels = col_dict.labels if hasattr(col_dict, 'labels') else None
132132

133133
if num_col in df.columns:
134134
df = bin_dataframe_column(df, num_col, str_col, bins=bins, labels=labels)
@@ -139,7 +139,6 @@ def apply_numeric_column_adjustments(self, df: pd.DataFrame):
139139
# else:
140140
# # Default "N-N" format conversion
141141
# df[str_col] = df[num_col].apply(lambda x: f'{int(x)}-{int(x)}' if pd.notna(x) else x)
142-
143142
return df
144143

145144
def build_data_frames_from_csv(self, filename: str):
@@ -226,7 +225,7 @@ def create_sheets_from_df(self, df: pd.DataFrame):
226225
if col in df.columns:
227226
df_cumsum = self.calculate_cumulative_sums(df, col)
228227
if col in self._numeric_cols:
229-
labels = self._numeric_cols[col].get('labels', None)
228+
labels = self._numeric_cols[col].labels if hasattr(self._numeric_cols[col], 'labels') else None
230229
if labels:
231230
# The first column (e.g., date) remains at index 0.
232231
date_column = df_cumsum.columns[0]
@@ -333,25 +332,28 @@ def _process_date_column(self, data_source: dict):
333332
"""
334333

335334
# This assumes that the first column is either the date column or does not have useful data
336-
if data_source.get('date'):
335+
date_value = getattr(data_source, 'date', None)
336+
if date_value:
337337
self._df.drop(self._df.columns[0], axis=1, inplace=True)
338-
self._df.insert(0, 'date', data_source['date'], False)
338+
self._df.insert(0, 'date', date_value, False)
339339

340340
self._df['date'] = pd.to_datetime(self._df['date'], errors='coerce')
341341

342342
self._columns['date'] = self._df.columns[0]
343343

344-
def _process_columns(self, data_source: dict):
344+
def _process_columns(self, data_source):
345345
"""
346346
Process and rename columns according to the data source settings.
347347
348348
Args:
349-
data_source (dict): The data source object.
349+
data_source (DataSource): The data source object.
350350
"""
351351
for col in self._df.columns[1:]:
352352
col_name = col
353-
if 'remove column name text' in data_source:
354-
for txt in data_source['remove column name text']:
353+
# Access remove_column_name_text from pydantic model
354+
remove_text = getattr(data_source, 'remove_column_name_text', None)
355+
if remove_text:
356+
for txt in remove_text:
355357
col_name = col.split(txt)[0]
356358
col_name = col_name.rstrip()
357359
self._columns[col_name] = col

src/midrc_react/core/famd_calc.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import numpy as np
2323
import pandas as pd
2424
import prince
25-
from tabulate import tabulate
2625

2726
from midrc_react.core.data_preprocessing import combine_datasets_from_list
2827
from midrc_react.core.numeric_distances import calc_distances_via_df, scale_feature
@@ -132,7 +131,11 @@ def calc_famd_df(raw_df, cols_to_use, numeric_cols, dataset_column='_dataset_',
132131
if len(outlier_df) > 0:
133132
outlier_df = outlier_df.sort_values(by=famd_column, ascending=False)
134133
print(f"Outliers in FAMD fitting: {outlier_df.shape[0]}")
135-
print(tabulate(outlier_df, headers='keys', tablefmt='psql'))
134+
try:
135+
from tabulate import tabulate
136+
print(tabulate(outlier_df, headers='keys', tablefmt='psql'))
137+
except ImportError:
138+
print(outlier_df)
136139

137140
return c_df
138141

src/midrc_react/core/jsdconfig.py

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,64 @@
1717
This module contains the JSDConfig class, which loads and stores data from a YAML file.
1818
"""
1919

20-
from dataclasses import dataclass, field
2120
import os
21+
from typing import List, Optional, Dict, Union, Any
2222

23+
from pydantic import BaseModel, Field, ValidationError
24+
from pydantic.dataclasses import dataclass
2325
from yaml import load
2426
try:
2527
from yaml import CLoader as Loader
2628
except ImportError:
2729
from yaml import Loader
2830

2931

30-
@dataclass
32+
class NumericColumnConfig(BaseModel):
33+
"""
34+
NumericColumnConfig model to represent numeric column configurations in the YAML configuration.
35+
"""
36+
raw_column: str = Field(..., alias='raw column')
37+
bins: List[float]
38+
labels: Optional[List[str]] = None
39+
adjust_outliers: bool = Field(False, alias='adjust outliers')
40+
41+
class DataSourceConfig(BaseModel):
42+
"""
43+
DataSource model to represent individual data sources in the YAML configuration.
44+
"""
45+
name: str
46+
description: Optional[str] = None
47+
data_type: str = Field(..., alias='data type')
48+
filename: str
49+
columns: Optional[List[str]] = None
50+
numeric_cols: Optional[Dict[str, NumericColumnConfig]] = None
51+
plugin: Optional[str] = None
52+
date: Optional[str] = None
53+
remove_column_name_text: Optional[List[str]] = Field(None, alias='remove column name text')
54+
55+
content: Optional[Any] = None # Placeholder for loaded content
56+
content_type: Optional[str] = None # Placeholder for content type after loading
57+
58+
class Config:
59+
validate_by_name = True
60+
extra = 'allow'
61+
62+
DataSourceConfigList = List[DataSourceConfig]
63+
64+
class ConfigData(BaseModel):
65+
"""
66+
ConfigData model to represent the structure of the YAML configuration data.
67+
"""
68+
# Define fields based on expected YAML structure
69+
data_sources: DataSourceConfigList = Field(..., alias='data sources')
70+
custom_age_ranges: Optional[Dict[str, List[Union[int, float]]]] = Field(None, alias='custom_age_range')
71+
72+
class Config:
73+
validate_by_name = True
74+
# accept extra fields in the YAML
75+
extra = 'allow'
76+
77+
3178
class JSDConfig:
3279
"""
3380
The JSDConfig class loads and stores data from a YAML file.
@@ -38,13 +85,16 @@ class JSDConfig:
3885
3986
Methods:
4087
__init__(self, filename='jsdconfig.yaml'): Initializes a new instance of JSDConfig.
41-
__post_init__(self): Loads the YAML data from the current filename.
88+
_load_data(self): Loads the YAML data from the current filename.
89+
set_filename(self, new_filename): Sets a new filename and reloads the data.
4290
"""
43-
filename: str = 'jsdconfig.yaml'
44-
data: dict = field(init=False)
91+
filename: str
92+
data: Optional[ConfigData]
4593

46-
def __post_init__(self):
94+
def __init__(self, filename: str = 'jsdconfig.yaml'):
4795
"""Load the YAML data from the current filename."""
96+
self.filename = filename
97+
self.data = None
4898
# os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
4999
self._load_data()
50100

@@ -53,11 +103,16 @@ def _load_data(self):
53103
if not os.path.exists(self.filename):
54104
print(f"File {self.filename} does not exist. Skipping load.")
55105
print(f"Current working directory: {os.getcwd()}")
56-
self.data = {}
106+
self.data = None
57107
return
58108

59109
with open(self.filename, 'r', encoding='utf-8') as stream:
60-
self.data = load(stream, Loader=Loader)
110+
raw = load(stream, Loader=Loader)
111+
try:
112+
self.data = ConfigData(**raw)
113+
except ValidationError as e:
114+
self.data = None
115+
raise
61116
# print(dump(self.data))
62117

63118
def set_filename(self, new_filename: str):

0 commit comments

Comments (0)