EDIT 27/01/2022
I will test solution of @hpchavaz
part of slow issue came from function called during loop. 2 of these function load csv files (pd.read_csv). I optimize by loading only columns needed (and not the all file).
And after, I tried to avoid loop:
- by using pd.melt to 'transpose' columns in rows
- by using apply wth lambda
I have divided execution time by 5 (227 minutes -> 45 minutes)
pd.set_option("display.max_rows", None, "display.max_columns", None)
# pd.set_option('max_colwidth', None)
# page 332 of SDTMIG3.3 Guidelines
# Finding About
domain = 'FA'
print(f'Domain {domain}')
print(datetime.datetime.now().strftime('%Y-%m-%d - %H:%M:%S'))
start = datetime.datetime.now()
# Données sources
df_randomisation = pd.read_csv(f'./csv/source/randomisation.csv',delimiter=",")
df_randomisation = df_randomisation.loc[:, df_randomisation.columns!='redcap_repeat_instance'] # exclude column redcap_repeat_instance as randomisation form are unique
df_randomisation = df_randomisation.loc[:, df_randomisation.columns!='redcap_event_name'] # exclude column redcap_repeat_name as randomisation form are unique
df_randomisation = df_randomisation.loc[:, df_randomisation.columns!='ID'] # exclude column redcap_repeat_name as randomisation form are unique
df_clinical_exam = pd.read_csv(f'./csv/source/examen_clinique_symptomes.csv',delimiter=",")
df = pd.merge(df_randomisation,df_clinical_exam, left_on='pat_ide', right_on='pat_ide', how='outer').sort_values(by=['pat_ide','sym_dat'])
# filtre sur un patient pour les tests
# df = df[(df['pat_ide'] == 'BFBO001')]
# print(df)
# Transformation des données sources : évènements en colonnes => évènements en lignes
df1 = pd.melt(df, id_vars=['pat_ide','sym_dat','redcap_event_name'], value_vars=['sym_tou', 'sym_gra','sym_ete','sym_fie','sym_fat','sym_dys','sym_mya','sym_cep','sym_gor','sym_cut','sym_ore','sym_nez','sym_odo','sym_gou','sym_ver','sym_vis','sym_aud','sym_tho','sym_pal','sym_nau','sym_vom','sym_dia','sym_abd','sym_mal','sym_con'],var_name='symptome', value_name='intensite')
df2 = pd.melt(df, id_vars=['pat_ide','sym_dat'], value_vars=['sym_tou_del','sym_gra_del','sym_ete_del','sym_fie_del','sym_fat_del','sym_dys_del','sym_mya_del','sym_cep_del','sym_gor_del','sym_cut_del','sym_ore_del','sym_nez_del','sym_odo_del','sym_gou_del','sym_ver_del','sym_vis_del','sym_aud_del','sym_tho_del','sym_pal_del','sym_nau_del','sym_vom_del','sym_dia_del','sym_abd_del','sym_mal_del','sym_con_del'],var_name='symptome', value_name='delai')
df2['symptome'] = df2.apply(lambda row: row['symptome'][:-4],axis=1)
# print(df1)
# print(df2)
# list of 25 monitored clinical signs
signs = {
'sym_tou' : 'Dry cough',
'sym_gra' : 'Fatty cough',
'sym_ete' : 'Sneezing',
'sym_fie' : 'Fever',
'sym_fat' : 'Tiredness',
'sym_dys' : 'Dyspnea',
'sym_mya' : 'Muscle aches (Myalgia)',
'sym_cep' : 'Headache',
'sym_gor' : 'Sore throat',
'sym_cut' : 'Skin rash',
'sym_ore' : 'Earache',
'sym_nez' : 'Runny nose (Rhinorrhea)',
'sym_odo' : 'Loss of smell (Anosmia)',
'sym_gou' : 'Loss of taste (Ageusia)',
'sym_ver' : 'Feeling dizzy',
'sym_vis' : 'Acute vision impairment',
'sym_aud' : 'Acute hearing impairment',
'sym_tho' : 'Chest pain',
'sym_pal' : 'Palpitations',
'sym_nau' : 'Nausea',
'sym_vom' : 'Vomiting',
'sym_dia' : 'Diarrhea',
'sym_abd' : 'Abdominal pain',
'sym_mal' : 'Inexplicated tiredness',
'sym_con' : 'Conjunctivitis'
}
dataframe = pd.DataFrame()
dataframe = pd.merge(df1,df2, left_on=['pat_ide','sym_dat','symptome'], right_on=['pat_ide','sym_dat','symptome'], how='outer').fillna('NULL')
dataframe['symptome'] = dataframe.apply(lambda row: signs[row['symptome']],axis=1)
dataframe = dataframe.sort_values(by=['pat_ide','sym_dat'])
# print(dataframe)
# print(len(dataframe.index))
# FA
sdtm_variables = ['STUDYID','DOMAIN','USUBJID','FASEQ','FATESTCD','FATEST','FAOBJ','FACAT','FAORRES','FAORRESU','FASTRESC','FASTRESN','FASTAT','VISITNUM','VISIT','VISITDY','EPOCH','FADTC','FADY','SYM_DAT','DEL','pat_ide']
df_fa = pd.DataFrame(columns = sdtm_variables)
df_fa['pat_ide'] = dataframe.apply(lambda row: row['pat_ide'],axis=1)
df_fa['STUDYID'] = dataframe.apply(lambda row: '01-COV',axis=1)
df_fa['DOMAIN'] = dataframe.apply(lambda row: 'FA',axis=1)
df_fa['USUBJID'] = dataframe.apply(lambda row: Patient_icon_format(row['pat_ide']),axis=1) # index of df
df_fa['FASEQ'] = dataframe.apply(lambda row: 1,axis=1) # pd.RangeIndex(stop=dataframe.shape[0]) + 1
df_fa['FATESTCD'] = dataframe.apply(lambda row:'OCCUR',axis=1)
df_fa['FATEST'] = dataframe.apply(lambda row:'Occurrence Indicator',axis=1)
df_fa['FAOBJ'] = dataframe.apply(lambda row: row['symptome'],axis=1)
df_fa['FACAT'] = dataframe.apply(lambda row: 'COVID-19 SYMPTOMS',axis=1)
df_fa['FAORRES'] = dataframe.apply(lambda row: Clinical_result(row['intensite']),axis=1)
df_fa['FAORRESU'] = dataframe.apply(lambda row: '',axis=1)
df_fa['FASTRESC'] = dataframe.apply(lambda row: Clinical_standard_result(row['intensite']),axis=1)
df_fa['FASTRESN'] = dataframe.apply(lambda row: np.nan,axis=1)
df_fa['FASTAT'] = dataframe.apply(lambda row: 'NOT DONE' if Clinical_result(row['intensite']) == '' else '',axis=1)
df_fa['VISITNUM'] = dataframe.apply(lambda row: str(Visit_numbering(row['pat_ide'],row['sym_dat'],row['redcap_event_name'])),axis=1)
df_fa['VISIT'] = dataframe.apply(lambda row: Visit_name(row['pat_ide'],row['sym_dat'],row['redcap_event_name']),axis=1)
df_fa['VISITDY'] = dataframe.apply(lambda row: '-1' if Visit_numbering(row['pat_ide'],row['sym_dat'],row['redcap_event_name']) == 1 else str(round((Visit_numbering(row['pat_ide'],row['sym_dat'],row['redcap_event_name']) - 1))),axis=1)
df_fa['EPOCH'] = dataframe.apply(lambda row: Get_epoch(row['pat_ide'],row['sym_dat'],row['redcap_event_name']),axis=1)
df_fa['FADTC'] = dataframe.apply(lambda row: pd.to_datetime(row['sym_dat']).strftime('%Y-%m-%dT%H:%M:%S') if row['sym_dat'] != 'NULL' else None,axis=1)
df_fa['FADY'] = dataframe.apply(lambda row: Number_of_day_since_the_subject_start_the_study(dataframe,row['sym_dat'],Patient_study_start_date(row['pat_ide'])),axis=1)
df_fa['SYM_DAT'] = dataframe.apply(lambda row: row['sym_dat'],axis=1)
df_fa['DEL'] = dataframe.apply(lambda row: row['delai'],axis=1)
# add line numbers of event (based on pat_ide and date of event) => use of redacp_repeat_instance variable of the event form
df_fa = df_fa.sort_values(by=['pat_ide','FADTC'])
df_fa = Sequence(df_fa,'FASEQ','FADTC')
df_fa = df_fa.sort_values(by=['pat_ide','FADTC'])
print(df_fa)
# SUPPFA
# creation of empty dataframes
# 2 dataframes because 2 'SUPP' variables are necessary for SUPPFA (FASTDTC et FAPRESP)
df_suppfa1 = pd.DataFrame(columns = ['STUDYID','RDOMAIN','USUBJID','IDVAR','IDVARVAL','QNAM','QLABEL','QVAL','QORIG','QEVAL','pat_ide'])
df_suppfa2 = pd.DataFrame(columns = ['STUDYID','RDOMAIN','USUBJID','IDVAR','IDVARVAL','QNAM','QLABEL','QVAL','QORIG','QEVAL','pat_ide'])
df_suppfa1['pat_ide'] = df_fa.apply(lambda row: row['pat_ide'],axis=1)
df_suppfa1['STUDYID'] = '01-COV'
df_suppfa1['RDOMAIN'] = 'FA'
df_suppfa1['USUBJID'] = df_fa.apply(lambda row: row['USUBJID'],axis=1)
df_suppfa1['IDVAR'] = 'FASEQ'
df_suppfa1['IDVARVAL'] = df_fa.apply(lambda row: row['FASEQ'],axis=1)
df_suppfa1['QNAM'] = 'FASTDTC'
df_suppfa1['QLABEL'] = 'COVID-19 Symptom Start Date'
df_suppfa1['QVAL'] = df_fa.apply(lambda row: (pd.to_datetime(row['SYM_DAT']) + timedelta(days = - int(row['DEL']))).strftime('%Y-%m-%d') if row['DEL'] != 'NULL' else row['SYM_DAT'],axis=1)
df_suppfa1['QORIG'] = 'CRF'
df_suppfa1['QEVAL'] = ''
df_suppfa2['pat_ide'] = df_fa.apply(lambda row: row['pat_ide'],axis=1)
df_suppfa2['STUDYID'] = '01-COV'
df_suppfa2['RDOMAIN'] = 'FA'
df_suppfa2['USUBJID'] = df_fa.apply(lambda row: row['USUBJID'],axis=1)
df_suppfa2['IDVAR'] = 'FASEQ'
df_suppfa2['IDVARVAL'] = df_fa.apply(lambda row: row['FASEQ'],axis=1)
df_suppfa2['QNAM'] = 'FAPRESP'
df_suppfa2['QLABEL'] = 'Prespecified'
df_suppfa2['QVAL'] = 'Y'
df_suppfa2['QORIG'] = 'CRF'
df_suppfa2['QEVAL'] = ''
dataframes = [df_suppfa1,df_suppfa2]
df_suppfa = pd.concat(dataframes)
df_suppfa = df_suppfa.drop(['pat_ide',], axis=1, errors='ignore')
# print(df_suppfa)
#-----------------------------------------------------------------------------------------------------------------------
sdtm_variables.remove('pat_ide')
sdtm_variables.remove('SYM_DAT')
sdtm_variables.remove('DEL')
# print(sdtm_variables)
df_fa = df_fa[sdtm_variables]
df_fa = df_fa.replace('NULL','')
# 3. Export data to csv
df_fa.to_csv(f'./xpt/{domain.lower()}_refactored.csv',index=False)
df_suppfa.to_csv(f'./xpt/supp{domain.lower()}_refactored.csv',index=False)
print(datetime.datetime.now().strftime('%Y-%m-%d - %H:%M:%S'))
print('Execution time',datetime.datetime.now() - start)
EDIT
normally, there 25 monitored clinical signs (and not only 4) and 2713 row in source dataframe
I have a Python code using the pandas library that works but is too slow because I use nested for-loops, and raw data become more important.
# initial dataframe: return 9 rows
df = pd.DataFrame([
['PAT001','2022-01-23',1,2,0,0,2,3,3,1],
['PAT001','2022-01-24',1,2,0,0,2,3,3,1],
['PAT001','2022-01-25',1,2,0,0,2,3,3,1],
['PAT002','2022-01-23',1,2,0,0,2,3,3,1],
['PAT002','2022-01-24',1,2,0,0,2,3,3,1],
['PAT002','2022-01-25',1,2,0,0,2,3,3,1],
['PAT002','2022-01-26',1,2,0,0,2,3,3,1],
], columns=['pat_ide','sym_dat','sym_tou','sym_gra','sym_ete','sym_fie','sym_tou_del','sym_gra_del','sym_ete_del','sym_fie_del'])
tmp_df_exams = pd.DataFrame(columns = ['pat_ide','DOMAIN','USUBJID','FAOBJ','SYM_DAT','DEL'])
# list of 4 monitored clinical signs
signs = {
'sym_tou' : 'Dry cough',
'sym_gra' : 'Fatty cough',
'sym_ete' : 'Sneezing',
'sym_fie' : 'Fever'
}
for index, row in df.iterrows():
for k,v in signs.items():
tmp_df_exams = tmp_df_exams.append({
'pat_ide': row['pat_ide'],
'DOMAIN' : 'FA',
'USUBJID' : Patient_icon_format(row['pat_ide']),
'FAOBJ' : v,
'SYM_DAT' : row['sym_dat'],
'DEL' : row[f'{k}_del']
},ignore_index=True)
Currently, it takes hours to end on entire raw data. To have an idea, if I filter the initial dataframe (9 rows) it takes between 68s and 81s.
How can I improve performance?
OUTPUT

kandvcalled ?