-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathClean_visual.py
More file actions
95 lines (75 loc) · 2.24 KB
/
Clean_visual.py
File metadata and controls
95 lines (75 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from word2number import w2n
import csv
# Load the raw job data from disk and keep it in `df` for cleaning.
data = pd.read_csv("unclean_job_data.csv")
df = pd.DataFrame(data)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
df.info()
# Drop every row that has no phone number.
df.dropna(subset=["Phone Number"], inplace=True)
df.info()
# Changing Age to Numbers
def agenum(value):
    """Convert one raw Age cell to a number.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Otherwise the value is parsed with ``pd.to_numeric``; if that
    fails, the value is stripped, lower-cased and passed to
    ``w2n.word_to_num`` so that numbers written as words can be accepted.

    Args:
        value: A single cell from the Age column.

    Returns:
        The parsed number, the original value when it is missing, or
        ``pd.NA`` when neither parser can handle the value.
    """
    if pd.isna(value):
        return value
    try:
        return pd.to_numeric(value)
    except (ValueError, TypeError):
        # Not a plain numeric literal; fall through to the word parser.
        pass
    try:
        text = value.strip().lower()
        return w2n.word_to_num(text)
    except Exception:
        # Best effort: any failure to read the text as a number marks the
        # cell as missing. Only Exception is caught, so Ctrl-C and
        # SystemExit are no longer swallowed while the column is processed.
        return pd.NA
# Convert every Age cell with agenum, then force a numeric dtype: agenum can
# return pd.NA, which would otherwise leave the column as object dtype and
# make numeric-only operations (such as corr(numeric_only=True)) skip it.
df["Age"] = pd.to_numeric(df["Age"].apply(agenum), errors="coerce")
# Fill missing ages with the mean of the known ages.
x = df["Age"].mean()
df.fillna({"Age": x}, inplace=True)
# DataFrame.info() prints its own report and returns None, so no print().
df.info()
# Fill missing emails with the placeholder string "None".
df.fillna({"Email": "None"}, inplace=True)
df.info()
# Changing DataTypes In Salary To Numeric
def numbizer(value):
    """Convert one raw Salary cell to a number.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Otherwise the value is parsed with ``pd.to_numeric``; if that
    fails, the value is stripped, lower-cased and passed to
    ``w2n.word_to_num`` so that numbers written as words can be accepted.

    Args:
        value: A single cell from the Salary column.

    Returns:
        The parsed number, the original value when it is missing, or
        ``pd.NA`` when neither parser can handle the value.
    """
    if pd.isna(value):
        return value
    try:
        return pd.to_numeric(value)
    except (ValueError, TypeError):
        # Not a plain numeric literal; fall through to the word parser.
        pass
    try:
        text = value.strip().lower()
        return w2n.word_to_num(text)
    except Exception:
        # Best effort: any failure to read the text as a number marks the
        # cell as missing. Only Exception is caught, so Ctrl-C and
        # SystemExit are no longer swallowed while the column is processed.
        return pd.NA
# Convert every Salary cell with numbizer and show the result.
df["Salary"] = df["Salary"].apply(numbizer)
print(df["Salary"].to_string())
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print() (which would print "None").
df.info()
# Remove rows whose salary is missing (numbizer yields pd.NA for values it
# cannot convert, so those rows are dropped here as well).
df.dropna(subset=["Salary"], inplace=True)
df.info()
# Keep only rows whose phone number is present and not blank.
# astype(str) lets the blank check work even when the column was parsed as
# numbers, where the .str accessor would otherwise raise; missing values
# are already excluded by the notna() half of the mask.
mask = df["Phone Number"].notna() & (
    df["Phone Number"].astype(str).str.strip() != ""
)
# .copy() gives an independent frame, so the in-place edits below do not
# operate on a slice of the previous DataFrame.
df = df[mask].copy()
print(df["Phone Number"].to_string())
# Remove fully duplicated rows.
df.drop_duplicates(inplace=True)
# Store Salary as integers.
df["Salary"] = df["Salary"].astype(int)
# Save the cleaned data to a new CSV: header row first, then one row per
# record. The encoding is fixed to UTF-8 so the output does not depend on
# the machine's locale default; newline="" is what the csv module expects.
with open("cleandata.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(df.columns)
    writer.writerows(df.values)
# Pairwise correlation over the numeric columns of the cleaned frame
# (numeric_only=True leaves non-numeric columns out of the matrix).
correlation = pd.DataFrame(df.corr(numeric_only=True))
print(correlation)
# Write the correlation matrix to its own CSV file.
correlation.to_csv("Age_Salary_Correlation.csv")
# Creating Visualizations
# Font settings: Tfont is applied to the title, axfont to the y-axis label.
Tfont = dict(family="serif", color="blue", size=20)
axfont = dict(family="fantasy", color="darkred", size=15)
# Draw the correlation matrix as a line plot, then decorate the axes.
plt.plot(correlation)
plt.title("AGE vs SALARY CORRELATION", fontdict=Tfont, loc="left")
plt.ylabel("Correlation Values", fontdict=axfont, loc="center")
plt.grid(lw=0.5, ls="--", color="green")
plt.show()