Data Cleaning
HTML-код
- Опубликовано: 13 янв 2025
- #Below is the code used in the video
"""
@author: ambi
copyright = "Copyright (C) Ambi"
"""
#TOPIC 1 : Visualize Missing values
#TOPIC 2 : Dealing with unusual values
#TOPIC 3 : Dealing with Missing Data - drop them or replace with mean/median/mode
#TOPIC 4 : Dealing with Outliers
#TOPIC 5 : Dealing with duplicates
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# TOPIC 1: load the demo workbook and visualize where values are missing.
df = pd.read_excel('small_iris_dataset_demo_for_cleaning.xlsx')
df.info()  # prints column dtypes and non-null counts

# Keep an untouched copy so the raw data stays available after cleaning.
dfOriginal = df.copy()

# Heatmap of the boolean NA mask: cells that are True (missing) are drawn
# in a different color from cells that hold a value.
plt.figure()
missing_mask = df.isna()
sns.heatmap(missing_mask)
# TOPIC 2: treat out-of-range measurements in one column as missing.
colName = 'sepal_length'

# Values below 0 or above 400 are considered invalid for this column.
valid_min = 0
valid_max = 400

# Build the mask once over the whole column instead of testing row by row;
# unlike a per-label loop, this also works when the index has duplicate labels.
# Existing NaN entries compare False on both sides and are left untouched.
out_of_range = (df[colName] < valid_min) | (df[colName] > valid_max)
df.loc[out_of_range, colName] = pd.NA

# The non-null counts now reflect the values that were just blanked out.
df.info()
# TOPIC 3: report how many values are missing per column, then drop those rows.
num_missing_values = df.isna().sum()
print(num_missing_values)

# Remove, in place, the rows that contain missing values (dropna defaults).
df.dropna(inplace=True)
# TOPIC 4: trim outliers using the 1st and 99th percentiles of the column.
lower_limit = df[colName].quantile(0.01)
upper_limit = df[colName].quantile(0.99)

# Keep only rows strictly inside the two limits; rows equal to a limit
# are excluded as well.
inside_limits = (df[colName] > lower_limit) & (df[colName] < upper_limit)
df_filtered = df[inside_limits]

# Give each boxplot its own figure. DataFrame.boxplot draws on the current
# axes when no `ax` is passed, so without a fresh figure the two plots would
# be drawn on top of each other (and of any earlier plot).
plt.figure()
df.boxplot(colName)
plt.figure()
df_filtered.boxplot(colName)

# TOPIC 5: drop rows that duplicate an earlier row across all columns.
df_cleaned = df_filtered.drop_duplicates()