# Creating a dict of columns so as to avoid checking multiple datatypes:
# each data type maps to the list of column names that have that type.
d = {}
for col_name, dtype in df.dtypes:
    # df.dtypes is iterated as (column name, data type) pairs.
    d.setdefault(dtype, []).append(col_name)

# Display one sub-DataFrame per data type.
for key, val in d.items():
    df.select(val).show()
    # write df to the location
# Column names for each data type of interest; df.dtypes is iterated as
# (column name, data type) pairs.
int_cols = [name for name, dtype in df.dtypes if dtype == 'int']
string_cols = [name for name, dtype in df.dtypes if dtype == 'string']
float_cols = [name for name, dtype in df.dtypes if dtype == 'float']

# Creating DataFrames for each data type
int_df = df.select(int_cols)
string_df = df.select(string_cols)
float_df = df.select(float_cols)
# Creating a dict of columns to avoid checking multiple datatypes:
# each data type maps to the list of column names that have that type.
d = {}
for col_name, dtype in df.dtypes:
    # df.dtypes is iterated as (column name, data type) pairs.
    d.setdefault(dtype, []).append(col_name)
print(d)

# Display one sub-DataFrame per data type.
for key, val in d.items():
    df.select(val).show()
    # write df to the location (select the columns first so that only this
    # data type's columns are written, not the whole DataFrame)
    # df.select(val).write.mode('overwrite').save(f'temp_loc/{key}')
Hi Sagar, for this Capgemini Data Engineer Interview Question - Round 1 | Save Multiple Columns in the DataFrame, how much experience did the candidate have?
# My solution: group the column names by data type, then show one
# sub-DataFrame per data type.
cols_by_dtype = {}  # not named `dict`, which would shadow the builtin
for col_name, dtype in df.dtypes:
    # df.dtypes is iterated as (column name, data type) pairs.
    cols_by_dtype.setdefault(dtype, []).append(col_name)

for col_names in cols_by_dtype.values():
    df_s = df.select(col_names)
    df_s.show()  ## did show instead of writing
Very good that you are posting real interview questions; many of them simply explain concept definitions.
@@kunuturuaravindreddy5879 thanks
# Creating a dict of columns so as to avoid checking multiple datatypes:
# each data type maps to the list of column names that have that type.
d = {}
for col_name, dtype in df.dtypes:
    # df.dtypes is iterated as (column name, data type) pairs.
    d.setdefault(dtype, []).append(col_name)

# Display one sub-DataFrame per data type.
for key, val in d.items():
    df.select(val).show()
    # write df to the location
# Column names for each data type of interest; df.dtypes is iterated as
# (column name, data type) pairs.
int_cols = [name for name, dtype in df.dtypes if dtype == 'int']
string_cols = [name for name, dtype in df.dtypes if dtype == 'string']
float_cols = [name for name, dtype in df.dtypes if dtype == 'float']

# Creating DataFrames for each data type
int_df = df.select(int_cols)
string_df = df.select(string_cols)
float_df = df.select(float_cols)
# Creating a dict of columns to avoid checking multiple datatypes:
# each data type maps to the list of column names that have that type.
d = {}
for col_name, dtype in df.dtypes:
    # df.dtypes is iterated as (column name, data type) pairs.
    d.setdefault(dtype, []).append(col_name)
print(d)

# Display one sub-DataFrame per data type.
for key, val in d.items():
    df.select(val).show()
    # write df to the location (select the columns first so that only this
    # data type's columns are written, not the whole DataFrame)
    # df.select(val).write.mode('overwrite').save(f'temp_loc/{key}')
Good problem to solve. Thanks for posting sagar!
Thank you
Thank you for posting this video. But can you please post PySpark interview questions for freshers? Thank you!
My Way Sir
# Bucket the column names by data type; only 'int', 'string' and 'float'
# columns are collected, any other data type is ignored.
intType = []
stringType = []
floatType = []
for col_name, dtype in df.dtypes:
    # df.dtypes is iterated as (column name, data type) pairs.
    if dtype == 'int':
        intType.append(col_name)
    elif dtype == 'string':
        stringType.append(col_name)
    elif dtype == 'float':
        floatType.append(col_name)

# One DataFrame per data type, holding only the columns of that type.
dfInt = df.select(*intType)
dfString = df.select(*stringType)
dfFloat = df.select(*floatType)
Nice
Great problem sagar
Shouldn’t you use append instead of overwrite?
Completed 👏
Were you asked to take any iMocha test?
No
@@GeekCoders okk...
Thanks a lot Sir
Thank you
cool question
Okay, is this the internal functionality for conversion to Parquet format?
yes
My solution is as follows:
# Start from three references to the full DataFrame and drop, from each one,
# every column whose data type does not match that DataFrame's target type.
# The names avoid `float`, which would shadow the builtin.
string_df = df
integer_df = df
float_df = df
for col_name, dtype in df.dtypes:
    # df.dtypes is iterated as (column name, data type) pairs.
    # Three independent checks replace the original elif chain, so a column
    # of any other type (neither int, string nor float) is removed from all
    # three results, including the integer one.
    if dtype != 'string':
        string_df = string_df.drop(col_name)
    if dtype != 'int':
        integer_df = integer_df.drop(col_name)
    if dtype != 'float':
        float_df = float_df.drop(col_name)

print(string_df)
print(integer_df)
print(float_df)
Hi Sagar,
for this Capgemini Data Engineer Interview Question - Round 1 | Save Multiple Columns in the DataFrame,
how much experience did the candidate have?
4 years
my solution:
# Group the column names by data type, then show one sub-DataFrame per
# data type.
cols_by_dtype = {}  # not named `dict`, which would shadow the builtin
for col_name, dtype in df.dtypes:
    # df.dtypes is iterated as (column name, data type) pairs.
    cols_by_dtype.setdefault(dtype, []).append(col_name)

for col_names in cols_by_dtype.values():
    df_s = df.select(col_names)
    df_s.show()
    ## did show instead of writing