-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
76 lines (57 loc) · 2.31 KB
/
data_preprocessing.py
File metadata and controls
76 lines (57 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# data_preprocessing.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# Optional: For visualization and debugging (commented out)
# import matplotlib.pyplot as plt
# import seaborn as sns
# ------------------------------
# 1. Load and Inspect Dataset
# ------------------------------

# Read the raw dataset from disk.
df = pd.read_csv('Dataset/FidelFolio_Dataset.csv')

# Column groups used throughout the pipeline.
# NOTE(review): the target names carry leading/trailing spaces — they must
# match the CSV header exactly; confirm against the raw file.
features = [f'Feature{i}' for i in range(1, 29)]
targets = [' Target 1 ', ' Target 2 ', ' Target 3 ']

# These columns arrive as strings with thousands separators (commas).
cols_to_convert = [f"Feature{i}" for i in (4, 5, 6, 7, 9)] + targets

# Strip the commas, then coerce to numeric; anything unparseable becomes
# NaN and is handled by the imputation step later in the script.
for col in cols_to_convert:
    without_commas = df[col].astype(str).str.replace(',', '', regex=False)
    df[col] = pd.to_numeric(without_commas, errors='coerce')

# Stable company/year ordering for consistent time-series processing.
df = df.sort_values(by=["Company", "Year"])

# Safety net: guarantee every target column is numeric (a no-op for the
# targets already coerced in the loop above).
df[targets] = df[targets].apply(pd.to_numeric, errors='coerce')
# ------------------------------
# 2. Fill Missing Values
# ------------------------------

def _impute_company_then_global(frame, column):
    """Return `frame[column]` with NaNs imputed in two stages.

    Stage 1: fill each missing value with the mean of its company's
    observed values. Stage 2: any value still missing (a company with no
    observations at all) falls back to the column's global mean.
    The input frame is not modified.
    """
    by_company = frame.groupby('Company')[column].transform(
        lambda s: s.fillna(s.mean())
    )
    return by_company.fillna(frame[column].mean())

# Targets and features share the exact same two-stage imputation strategy,
# so run them through one loop instead of two duplicated ones.
for col in targets + features:
    df[col] = _impute_company_then_global(df, col)
# ------------------------------
# 3. Handle Outliers (Winsorization)
# ------------------------------

def cap_outliers(series):
    """Clip a numeric Series to the Tukey fences.

    Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are capped at the nearest
    fence; everything in range passes through unchanged.
    """
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    return series.clip(q1 - fence, q3 + fence)
# Winsorize every feature column, one Series at a time.
df[features] = df[features].apply(cap_outliers)

# ------------------------------
# 4. Normalize Features
# ------------------------------
# Z-score each feature (zero mean, unit variance) via scikit-learn.
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# ------------------------------
# 5. Save Cleaned Dataset
# ------------------------------
output_path = 'Dataset/FidelFolio_Dataset_Cleaned.csv'
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to {output_path}")