You can use this template:
import pandas as pd
# Load the data into a pandas DataFrame
df = pd.read_csv('your_dataset.csv')
# Removing duplicates
df = df.drop_duplicates()
# Handling missing values
df = df.dropna() # Drop rows with any missing values
# or
df = df.fillna(0) # Fill missing values with 0
# or
df = df.fillna(df.mean(numeric_only=True)) # Fill missing values with column means (numeric columns only)
# Removing unnecessary columns
df = df.drop(['unnecessary_column1', 'unnecessary_column2'], axis=1)
# Renaming columns
df = df.rename(columns={'old_column_name': 'new_column_name'})
# Changing data types
df['numeric_column'] = df['numeric_column'].astype(float)
# Filtering data
df = df[df['column'] > 100] # Filter rows based on a condition
# Merging data
df1 = pd.read_csv('data1.csv')
df2 = pd.read_csv('data2.csv')
merged_df = pd.merge(df1, df2, on='common_column')
# Splitting columns
df[['new_column1', 'new_column2']] = df['original_column'].str.split('delimiter', n=1, expand=True) # Split into two columns on the first delimiter
# Aggregating data
df_agg = df.groupby('grouping_column')['numeric_column'].sum()
# Transforming text
df['text_column'] = df['text_column'].str.strip()
# Handling inconsistent data
df['column'] = df['column'].replace({'inconsistent_value1': 'correct_value1', 'inconsistent_value2': 'correct_value2'})
# Handling outliers
df = df[(df['column'] > lower_threshold) & (df['column'] < upper_threshold)]
# Correcting data errors
df['date_column'] = pd.to_datetime(df['date_column'], errors='coerce')
# Removing leading/trailing spaces
df['text_column'] = df['text_column'].str.strip()
# Handling special characters
df['text_column'] = df['text_column'].str.replace('special_character', '', regex=False) # Remove a literal special character
# Normalizing data (min-max scaling; assumes the DataFrame is all numeric)
df_normalized = (df - df.min()) / (df.max() - df.min())
# Handling data conversions
df['numeric_column'] = df['numeric_column'] * conversion_factor
# Handling data inconsistencies
df['column'] = df['column'].replace({'inconsistent_name1': 'consistent_name1', 'inconsistent_name2': 'consistent_name2'})
# Handling data relationships
# Depends on the specific scenario and data source
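# An illustrative sketch (file and column names here are hypothetical):
# before relating two tables, check that every key in the detail table
# exists in the lookup table.
orders = pd.read_csv('orders.csv')
customers = pd.read_csv('customers.csv')
orphans = orders[~orders['customer_id'].isin(customers['customer_id'])] # Rows that would break the relationship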
# Refreshing data
# Depends on the specific data source and refresh requirements
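# A minimal sketch, assuming the source is simply the same CSV file:
# re-reading it picks up whatever has changed on disk.
refreshed_df = pd.read_csv('your_dataset.csv') # Reload the source file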
# Save the cleaned data to a new CSV file
df.to_csv('cleaned_dataset.csv', index=False)
# Removing rows with specific conditions
df = df[~(df['column'] == 'value_to_remove')] # Remove rows where 'column' equals 'value_to_remove'
# Handling date/time data
df['date_column'] = pd.to_datetime(df['date_column']) # Convert string to datetime format
df['month'] = df['date_column'].dt.month # Extract month from datetime column
# Handling string manipulation
df['text_column'] = df['text_column'].str.lower() # Convert text to lowercase
df['text_column'] = df['text_column'].str.replace('old_text', 'new_text') # Replace specific text
# Handling numeric data
df['numeric_column'] = pd.to_numeric(df['numeric_column'], errors='coerce') # Convert to numeric, replacing non-numeric values with NaN
# Handling categorical data
df['category_column'] = df['category_column'].astype('category') # Convert to categorical data type
# Handling outliers using z-score
from scipy.stats import zscore
import numpy as np
df = df[np.abs(zscore(df['numeric_column'])) < 3] # Remove rows more than 3 standard deviations from the mean
# Handling duplicate values in a column
df['column'] = df['column'].str.strip().str.lower() # Clean column values (e.g., remove leading/trailing spaces, convert to lowercase)
df = df.drop_duplicates(subset=['column']) # Remove duplicate values based on 'column'
# Handling inconsistent date formats
df['date_column'] = pd.to_datetime(df['date_column'], format='%d-%m-%Y') # Convert date column with specific format
# Handling text extraction
df['extracted_value'] = df['text_column'].str.extract(r'(\d+)') # Extract numeric values from text using regular expressions
# Handling data validation using regular expressions
df = df[df['column'].str.contains(r'^[A-Za-z0-9_]+$', na=False)] # Keep rows where 'column' matches a specific pattern (missing values excluded)
# Handling data imputation
df['numeric_column'] = df['numeric_column'].fillna(df['numeric_column'].mean()) # Fill missing values with column mean
# Handling data normalization
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['numeric_column1', 'numeric_column2']] = scaler.fit_transform(df[['numeric_column1', 'numeric_column2']])
# Handling data binning
df['bin_column'] = pd.cut(df['numeric_column'], bins=5) # Bin numeric values into specified intervals
# Handling data deduplication based on multiple columns
df = df.drop_duplicates(subset=['column1', 'column2', 'column3'])
# Handling data validation using custom functions
def validate_data(row):
    if row['column1'] < row['column2']:
        return True
    else:
        return False
df = df[df.apply(validate_data, axis=1)] # Keep rows that pass the custom validation function
# Handling data standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['numeric_column1', 'numeric_column2']] = scaler.fit_transform(df[['numeric_column1', 'numeric_column2']])
# Handling data sampling
sampled_df = df.sample(n=100, random_state=42) # Randomly sample 100 rows from the DataFrame
# Handling data transformation
df['transformed_column'] = df['numeric_column'].apply(lambda x: x ** 2) # Apply a custom transformation to a column
# Handling data aggregation
df_agg = df.groupby('category_column')['numeric_column'].mean() # Calculate the mean of 'numeric_column' grouped by 'category_column'
Very informative
Your content is always very helpful and important, thanks!
Thanks, Prema!
Please give us more videos about Python in Power BI. It's really helpful. ❤
Thanks, sure will upload!
More videos like this one, please! I was just a little in doubt: if I go up to the Power BI Service, will this data transformation work normally, or will something else be needed? Because I realized that you need Python 3 on your machine.
Hi Meiry, you need a Personal Gateway installed on the machine where the file is located and where Python is installed; then everything will work fine on the Power BI Service.
How can you see your dataset behind the Run Python Script window? For me it's showing dataset and table, and I need to click the table to see the changes.
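For context, a minimal sketch of how Power BI's Run Python Script step works: the current query is passed to your script as a pandas DataFrame named dataset, and any DataFrame that exists when the script finishes appears as a selectable table in the result pane (the cleaning step here is just an example):
df = dataset.drop_duplicates() # 'dataset' is supplied by Power BI as the input table
# When the script finishes, select 'df' in the result pane to load the cleaned table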
Thanks for the great video.
I have a question: why do we need to use a pandas script? We can do the same thing with Applied Steps, and then there's no need for a gateway.
Where can I get this dataset from?
Hi, this can be done on any dataset.
Please do a video on Pandas.
Sure, will do!