# Group column names by their data type so each type is handled once,
# rather than testing every column against every data type.
d = {}
for col_name, col_type in df.dtypes:
    # setdefault creates the list the first time a data type is seen.
    d.setdefault(col_type, []).append(col_name)

for key, val in d.items():
    # Display the sub-DataFrame holding only the columns of this data type.
    df.select(val).show()
    # write df to the location
# Column names grouped by the data-type string reported in df.dtypes.
# NOTE(review): only exact 'int' / 'string' / 'float' matches are collected;
# columns reported with any other type string (e.g. 'bigint', 'double') end up
# in none of the lists -- confirm that is intended.
int_cols = [name for name, dtype in df.dtypes if dtype == 'int']
string_cols = [name for name, dtype in df.dtypes if dtype == 'string']
float_cols = [name for name, dtype in df.dtypes if dtype == 'float']

# Creating DataFrames for each data type
int_df = df.select(int_cols)
string_df = df.select(string_cols)
float_df = df.select(float_cols)
# Group column names by their data type so each type is handled once,
# rather than testing every column against every data type.
d = {}
for col_name, col_type in df.dtypes:
    # setdefault creates the list the first time a data type is seen.
    d.setdefault(col_type, []).append(col_name)
print(d)

for key, val in d.items():
    # Display the sub-DataFrame holding only the columns of this data type.
    df.select(val).show()
    # write df to the location
    # NOTE(review): the original hint wrote the whole `df` to every location;
    # saving the per-type selection is presumably what was meant.
    # df.select(val).write.mode('overwrite').save(f'temp_loc/{key}')
Hi Sagar, regarding this Capgemini Data Engineer Interview Question - Round 1 | Save Multiple Columns in the DataFrame: how much experience did the candidate have?
# my solution: group column names by data type, then show each group.
# The mapping is named cols_by_type so the builtin `dict` is not shadowed.
cols_by_type = {}
for col_name, col_type in df.dtypes:
    # setdefault creates the list the first time a data type is seen.
    cols_by_type.setdefault(col_type, []).append(col_name)

for col_type in cols_by_type:
    df_s = df.select(cols_by_type[col_type])
    df_s.show()  # did show instead of writing
Very good that you are posting real interview questions; many others simply explain concept definitions.
@@kunuturuaravindreddy5879 thanks
# Group column names by their data type so each type is handled once,
# rather than testing every column against every data type.
d = {}
for col_name, col_type in df.dtypes:
    # setdefault creates the list the first time a data type is seen.
    d.setdefault(col_type, []).append(col_name)

for key, val in d.items():
    # Display the sub-DataFrame holding only the columns of this data type.
    df.select(val).show()
    # write df to the location
Good problem to solve. Thanks for posting sagar!
Thank you
# Column names grouped by the data-type string reported in df.dtypes.
# NOTE(review): only exact 'int' / 'string' / 'float' matches are collected;
# columns reported with any other type string (e.g. 'bigint', 'double') end up
# in none of the lists -- confirm that is intended.
int_cols = [name for name, dtype in df.dtypes if dtype == 'int']
string_cols = [name for name, dtype in df.dtypes if dtype == 'string']
float_cols = [name for name, dtype in df.dtypes if dtype == 'float']

# Creating DataFrames for each data type
int_df = df.select(int_cols)
string_df = df.select(string_cols)
float_df = df.select(float_cols)
Thank you for posting this video. But can you please post PySpark interview questions for freshers? Thank you!
# Group column names by their data type so each type is handled once,
# rather than testing every column against every data type.
d = {}
for col_name, col_type in df.dtypes:
    # setdefault creates the list the first time a data type is seen.
    d.setdefault(col_type, []).append(col_name)
print(d)

for key, val in d.items():
    # Display the sub-DataFrame holding only the columns of this data type.
    df.select(val).show()
    # write df to the location
    # NOTE(review): the original hint wrote the whole `df` to every location;
    # saving the per-type selection is presumably what was meant.
    # df.select(val).write.mode('overwrite').save(f'temp_loc/{key}')
Completed 👏
Great problem sagar
Thanks a lot Sir
Thank you
Shouldn’t you use append instead of overwrite?
cool question
Were you asked to take any iMocha test?
No
@@GeekCoders okk...
Okay, is this the internal functionality for conversion to Parquet format?
yes
My Way Sir
# Buckets that collect the column names of each supported data type.
intType = []
stringType = []
floatType = []

# Dispatch table: data-type string -> the bucket for that type.
buckets = {'int': intType, 'string': stringType, 'float': floatType}
for col_name, col_type in df.dtypes:
    # Columns of any other data type are skipped.
    if col_type in buckets:
        buckets[col_type].append(col_name)

# One DataFrame per data type, built from the collected column names.
dfInt = df.select(*intType)
dfString = df.select(*stringType)
dfFloat = df.select(*floatType)
Nice
Hi Sagar,
regarding this Capgemini Data Engineer Interview Question - Round 1 | Save Multiple Columns in the DataFrame:
how much experience did the candidate have?
4 years
My solution is as follows:
# Start every result from the full DataFrame and drop whatever does not
# belong, so each result keeps only the columns of its own data type.
# The results are named *_df so the builtin `float` is not shadowed.
string_df = df
integer_df = df
float_df = df

for col_name, col_type in df.dtypes:
    # A column is removed from every result whose data type it does not match.
    # Columns that are neither 'string', 'int' nor 'float' are therefore
    # removed from all three results (the earlier if/elif chain left such
    # columns in the integer result).
    if col_type != 'string':
        string_df = string_df.drop(col_name)
    if col_type != 'int':
        integer_df = integer_df.drop(col_name)
    if col_type != 'float':
        float_df = float_df.drop(col_name)

print(string_df)
print(integer_df)
print(float_df)
my solution:
# Group column names by data type, then show each group.
# The mapping is named cols_by_type so the builtin `dict` is not shadowed.
cols_by_type = {}
for col_name, col_type in df.dtypes:
    # setdefault creates the list the first time a data type is seen.
    cols_by_type.setdefault(col_type, []).append(col_name)

for col_type in cols_by_type:
    df_s = df.select(cols_by_type[col_type])
    df_s.show()
    ##did show instead of writing