# Mode imputation example: fill missing values of the Iris 'class' column
# with the most frequent non-null class label.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create a Spark session
spark = SparkSession.builder.appName("ModeImputationExample").getOrCreate()

try:
    # Replace 'path/to/iris.csv' with the actual path to your Iris dataset CSV file
    iris = spark.read.csv("path/to/iris.csv", header=True, inferSchema=True)

    # Display the original DataFrame
    print("Original DataFrame:")
    iris.show()

    # Compute the mode over non-null labels only: groupBy() puts nulls in a
    # group of their own, and if that group were the largest the "mode" would
    # be None, which na.fill() cannot use as a replacement value.
    # The secondary sort on 'class' makes the choice deterministic on ties.
    mode_row = (
        iris.filter(col("class").isNotNull())
        .groupBy("class")
        .count()
        .orderBy(col("count").desc(), col("class").asc())
        .first()
    )

    # first() returns None when there is no non-null label at all (empty file
    # or an entirely missing column); in that case there is nothing to impute.
    class_mode = mode_row["class"] if mode_row is not None else None
    if class_mode is None:
        iris_imputed = iris
    else:
        iris_imputed = iris.na.fill({"class": class_mode})

    # Display the DataFrame after mode imputation
    print("DataFrame after mode imputation:")
    iris_imputed.show()
finally:
    # Stop the Spark session even if reading or imputing failed
    spark.stop()
How would you do the mode imputation for the class column?
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Mode imputation example: fill missing values of the Iris 'class' column
# with the most frequent non-null class label.

# Create a Spark session
spark = SparkSession.builder.appName("ModeImputationExample").getOrCreate()

try:
    # Replace 'path/to/iris.csv' with the actual path to your Iris dataset CSV file
    iris = spark.read.csv("path/to/iris.csv", header=True, inferSchema=True)

    # Display the original DataFrame
    print("Original DataFrame:")
    iris.show()

    # Compute the mode over non-null labels only: groupBy() puts nulls in a
    # group of their own, and if that group were the largest the "mode" would
    # be None, which na.fill() cannot use as a replacement value.
    # The secondary sort on 'class' makes the choice deterministic on ties.
    mode_row = (
        iris.filter(col("class").isNotNull())
        .groupBy("class")
        .count()
        .orderBy(col("count").desc(), col("class").asc())
        .first()
    )

    # first() returns None when there is no non-null label at all (empty file
    # or an entirely missing column); in that case there is nothing to impute.
    class_mode = mode_row["class"] if mode_row is not None else None
    if class_mode is None:
        iris_imputed = iris
    else:
        iris_imputed = iris.na.fill({"class": class_mode})

    # Display the DataFrame after mode imputation
    print("DataFrame after mode imputation:")
    iris_imputed.show()
finally:
    # Stop the Spark session even if reading or imputing failed
    spark.stop()