1

EDIT 27/01/2022

I will test solution of @hpchavaz

part of slow issue came from function called during loop. 2 of these function load csv files (pd.read_csv). I optimize by loading only columns needed (and not the all file).

And after, I tried to avoid loop:

  1. by using pd.melt to 'transpose' columns in rows
  2. by using apply wth lambda

I have divided execution time by 5 (227 minutes -> 45 minutes)

pd.set_option("display.max_rows", None, "display.max_columns", None)
# pd.set_option('max_colwidth', None)

# page 332 of SDTMIG3.3 Guidelines
# Finding About
domain = 'FA'
print(f'Domain {domain}')
print(datetime.datetime.now().strftime('%Y-%m-%d - %H:%M:%S'))
start = datetime.datetime.now()

# Données sources
df_randomisation = pd.read_csv(f'./csv/source/randomisation.csv',delimiter=",")
df_randomisation = df_randomisation.loc[:, df_randomisation.columns!='redcap_repeat_instance'] # exclude column redcap_repeat_instance as randomisation form are unique
df_randomisation = df_randomisation.loc[:, df_randomisation.columns!='redcap_event_name'] # exclude column redcap_repeat_name as randomisation form are unique
df_randomisation = df_randomisation.loc[:, df_randomisation.columns!='ID'] # exclude column redcap_repeat_name as randomisation form are unique

df_clinical_exam = pd.read_csv(f'./csv/source/examen_clinique_symptomes.csv',delimiter=",")
df = pd.merge(df_randomisation,df_clinical_exam, left_on='pat_ide', right_on='pat_ide', how='outer').sort_values(by=['pat_ide','sym_dat']) 

# filtre sur un patient pour les tests
# df = df[(df['pat_ide'] == 'BFBO001')]

# print(df)

# Transformation des données sources : évènements en colonnes => évènements en lignes
df1 = pd.melt(df, id_vars=['pat_ide','sym_dat','redcap_event_name'], value_vars=['sym_tou', 'sym_gra','sym_ete','sym_fie','sym_fat','sym_dys','sym_mya','sym_cep','sym_gor','sym_cut','sym_ore','sym_nez','sym_odo','sym_gou','sym_ver','sym_vis','sym_aud','sym_tho','sym_pal','sym_nau','sym_vom','sym_dia','sym_abd','sym_mal','sym_con'],var_name='symptome', value_name='intensite')
df2 = pd.melt(df, id_vars=['pat_ide','sym_dat'], value_vars=['sym_tou_del','sym_gra_del','sym_ete_del','sym_fie_del','sym_fat_del','sym_dys_del','sym_mya_del','sym_cep_del','sym_gor_del','sym_cut_del','sym_ore_del','sym_nez_del','sym_odo_del','sym_gou_del','sym_ver_del','sym_vis_del','sym_aud_del','sym_tho_del','sym_pal_del','sym_nau_del','sym_vom_del','sym_dia_del','sym_abd_del','sym_mal_del','sym_con_del'],var_name='symptome', value_name='delai')
df2['symptome'] = df2.apply(lambda row: row['symptome'][:-4],axis=1)
# print(df1)
# print(df2)

# list of 25 monitored clinical signs
signs = {
    'sym_tou' : 'Dry cough',
    'sym_gra' : 'Fatty cough',
    'sym_ete' : 'Sneezing',
    'sym_fie' : 'Fever',
    'sym_fat' : 'Tiredness',
    'sym_dys' : 'Dyspnea',
    'sym_mya' : 'Muscle aches (Myalgia)',
    'sym_cep' : 'Headache', 
    'sym_gor' : 'Sore throat',
    'sym_cut' : 'Skin rash',
    'sym_ore' : 'Earache',
    'sym_nez' : 'Runny nose (Rhinorrhea)',
    'sym_odo' : 'Loss of smell (Anosmia)',
    'sym_gou' : 'Loss of taste (Ageusia)',
    'sym_ver' : 'Feeling dizzy',
    'sym_vis' : 'Acute vision impairment',
    'sym_aud' : 'Acute hearing impairment',
    'sym_tho' : 'Chest pain',
    'sym_pal' : 'Palpitations',
    'sym_nau' : 'Nausea',
    'sym_vom' : 'Vomiting',
    'sym_dia' : 'Diarrhea',
    'sym_abd' : 'Abdominal pain',
    'sym_mal' : 'Inexplicated tiredness',
    'sym_con' : 'Conjunctivitis'
    }


dataframe = pd.DataFrame()
dataframe = pd.merge(df1,df2, left_on=['pat_ide','sym_dat','symptome'], right_on=['pat_ide','sym_dat','symptome'], how='outer').fillna('NULL')
dataframe['symptome'] = dataframe.apply(lambda row: signs[row['symptome']],axis=1)
dataframe = dataframe.sort_values(by=['pat_ide','sym_dat'])
# print(dataframe)
# print(len(dataframe.index))

# FA
sdtm_variables = ['STUDYID','DOMAIN','USUBJID','FASEQ','FATESTCD','FATEST','FAOBJ','FACAT','FAORRES','FAORRESU','FASTRESC','FASTRESN','FASTAT','VISITNUM','VISIT','VISITDY','EPOCH','FADTC','FADY','SYM_DAT','DEL','pat_ide']
df_fa = pd.DataFrame(columns = sdtm_variables)
 
df_fa['pat_ide'] = dataframe.apply(lambda row: row['pat_ide'],axis=1)
df_fa['STUDYID'] = dataframe.apply(lambda row: '01-COV',axis=1)
df_fa['DOMAIN'] = dataframe.apply(lambda row: 'FA',axis=1)
df_fa['USUBJID'] = dataframe.apply(lambda row: Patient_icon_format(row['pat_ide']),axis=1)  # index of df
df_fa['FASEQ'] = dataframe.apply(lambda row: 1,axis=1) # pd.RangeIndex(stop=dataframe.shape[0]) + 1
df_fa['FATESTCD'] = dataframe.apply(lambda row:'OCCUR',axis=1)
df_fa['FATEST'] = dataframe.apply(lambda row:'Occurrence Indicator',axis=1)
df_fa['FAOBJ'] = dataframe.apply(lambda row: row['symptome'],axis=1)
df_fa['FACAT'] = dataframe.apply(lambda row: 'COVID-19 SYMPTOMS',axis=1)
df_fa['FAORRES'] = dataframe.apply(lambda row: Clinical_result(row['intensite']),axis=1)
df_fa['FAORRESU'] = dataframe.apply(lambda row: '',axis=1) 
df_fa['FASTRESC'] = dataframe.apply(lambda row: Clinical_standard_result(row['intensite']),axis=1)
df_fa['FASTRESN'] = dataframe.apply(lambda row: np.nan,axis=1)
df_fa['FASTAT'] = dataframe.apply(lambda row: 'NOT DONE' if Clinical_result(row['intensite']) == '' else '',axis=1)
df_fa['VISITNUM'] = dataframe.apply(lambda row: str(Visit_numbering(row['pat_ide'],row['sym_dat'],row['redcap_event_name'])),axis=1)
df_fa['VISIT'] = dataframe.apply(lambda row: Visit_name(row['pat_ide'],row['sym_dat'],row['redcap_event_name']),axis=1)
df_fa['VISITDY'] = dataframe.apply(lambda row: '-1' if Visit_numbering(row['pat_ide'],row['sym_dat'],row['redcap_event_name']) == 1 else str(round((Visit_numbering(row['pat_ide'],row['sym_dat'],row['redcap_event_name']) - 1))),axis=1)
df_fa['EPOCH'] = dataframe.apply(lambda row: Get_epoch(row['pat_ide'],row['sym_dat'],row['redcap_event_name']),axis=1)
df_fa['FADTC'] = dataframe.apply(lambda row: pd.to_datetime(row['sym_dat']).strftime('%Y-%m-%dT%H:%M:%S') if row['sym_dat'] != 'NULL' else None,axis=1)
df_fa['FADY'] = dataframe.apply(lambda row: Number_of_day_since_the_subject_start_the_study(dataframe,row['sym_dat'],Patient_study_start_date(row['pat_ide'])),axis=1)
df_fa['SYM_DAT'] = dataframe.apply(lambda row: row['sym_dat'],axis=1)
df_fa['DEL'] = dataframe.apply(lambda row: row['delai'],axis=1)

# add line numbers of event (based on pat_ide and date of event) => use of redacp_repeat_instance variable of the event form
df_fa = df_fa.sort_values(by=['pat_ide','FADTC'])
df_fa = Sequence(df_fa,'FASEQ','FADTC')
df_fa = df_fa.sort_values(by=['pat_ide','FADTC'])
print(df_fa)

# SUPPFA

# creation of empty dataframes 
# 2 dataframes because 2 'SUPP' variables are necessary for SUPPFA (FASTDTC et FAPRESP)
df_suppfa1 = pd.DataFrame(columns = ['STUDYID','RDOMAIN','USUBJID','IDVAR','IDVARVAL','QNAM','QLABEL','QVAL','QORIG','QEVAL','pat_ide'])
df_suppfa2 = pd.DataFrame(columns = ['STUDYID','RDOMAIN','USUBJID','IDVAR','IDVARVAL','QNAM','QLABEL','QVAL','QORIG','QEVAL','pat_ide'])

df_suppfa1['pat_ide'] = df_fa.apply(lambda row: row['pat_ide'],axis=1)
df_suppfa1['STUDYID'] = '01-COV'
df_suppfa1['RDOMAIN'] = 'FA'
df_suppfa1['USUBJID'] = df_fa.apply(lambda row: row['USUBJID'],axis=1)
df_suppfa1['IDVAR'] = 'FASEQ'
df_suppfa1['IDVARVAL'] = df_fa.apply(lambda row: row['FASEQ'],axis=1)
df_suppfa1['QNAM'] = 'FASTDTC'
df_suppfa1['QLABEL'] = 'COVID-19 Symptom Start Date'
df_suppfa1['QVAL'] = df_fa.apply(lambda row: (pd.to_datetime(row['SYM_DAT']) + timedelta(days = - int(row['DEL']))).strftime('%Y-%m-%d') if row['DEL'] != 'NULL' else row['SYM_DAT'],axis=1)
df_suppfa1['QORIG'] = 'CRF'
df_suppfa1['QEVAL'] = ''

df_suppfa2['pat_ide'] = df_fa.apply(lambda row: row['pat_ide'],axis=1)
df_suppfa2['STUDYID'] = '01-COV'
df_suppfa2['RDOMAIN'] = 'FA'
df_suppfa2['USUBJID'] = df_fa.apply(lambda row: row['USUBJID'],axis=1)
df_suppfa2['IDVAR'] = 'FASEQ'
df_suppfa2['IDVARVAL'] = df_fa.apply(lambda row: row['FASEQ'],axis=1)
df_suppfa2['QNAM'] = 'FAPRESP'
df_suppfa2['QLABEL'] = 'Prespecified'
df_suppfa2['QVAL'] = 'Y'
df_suppfa2['QORIG'] = 'CRF'
df_suppfa2['QEVAL'] = ''

dataframes = [df_suppfa1,df_suppfa2]
df_suppfa = pd.concat(dataframes)

df_suppfa = df_suppfa.drop(['pat_ide',], axis=1, errors='ignore')
# print(df_suppfa)


#-----------------------------------------------------------------------------------------------------------------------

sdtm_variables.remove('pat_ide')
sdtm_variables.remove('SYM_DAT')
sdtm_variables.remove('DEL')
# print(sdtm_variables)
df_fa = df_fa[sdtm_variables] 
df_fa = df_fa.replace('NULL','') 


# 3. Export data to csv
df_fa.to_csv(f'./xpt/{domain.lower()}_refactored.csv',index=False)
df_suppfa.to_csv(f'./xpt/supp{domain.lower()}_refactored.csv',index=False)

print(datetime.datetime.now().strftime('%Y-%m-%d - %H:%M:%S'))
print('Execution time',datetime.datetime.now() - start)

EDIT

normally, there 25 monitored clinical signs (and not only 4) and 2713 row in source dataframe


I have a Python code using the pandas library that works but is too slow because I use nested for-loops, and raw data become more important.

# initial dataframe: return 9 rows
df = pd.DataFrame([
        ['PAT001','2022-01-23',1,2,0,0,2,3,3,1],
        ['PAT001','2022-01-24',1,2,0,0,2,3,3,1],
        ['PAT001','2022-01-25',1,2,0,0,2,3,3,1],
        ['PAT002','2022-01-23',1,2,0,0,2,3,3,1],
        ['PAT002','2022-01-24',1,2,0,0,2,3,3,1],
        ['PAT002','2022-01-25',1,2,0,0,2,3,3,1],
        ['PAT002','2022-01-26',1,2,0,0,2,3,3,1],
    ], columns=['pat_ide','sym_dat','sym_tou','sym_gra','sym_ete','sym_fie','sym_tou_del','sym_gra_del','sym_ete_del','sym_fie_del'])

tmp_df_exams = pd.DataFrame(columns = ['pat_ide','DOMAIN','USUBJID','FAOBJ','SYM_DAT','DEL'])

# list of 4 monitored clinical signs
signs = {
    'sym_tou' : 'Dry cough',
    'sym_gra' : 'Fatty cough',
    'sym_ete' : 'Sneezing',
    'sym_fie' : 'Fever'
}

for index, row in df.iterrows():
    for k,v in signs.items(): 
        tmp_df_exams = tmp_df_exams.append({
            'pat_ide': row['pat_ide'],
            'DOMAIN' : 'FA',
            'USUBJID' : Patient_icon_format(row['pat_ide']),
            'FAOBJ' : v,
            'SYM_DAT' : row['sym_dat'],
            'DEL' : row[f'{k}_del']
        },ignore_index=True)

Currently, it takes hours to end on entire raw data. To have an idea, if I filter the initial dataframe (9 rows) it takes between 68s and 81s.

How can I improve performance?

OUTPUT

enter image description here

3
  • you can't ask us to help you then remove key parts of your code my dude. Where are k and v called ? Commented Jan 25, 2022 at 13:05
  • The only way for us to help you is to provide a description of the final result you aim to get. There is no solution to improve loops. The only method is to replace them with the pre-build method provided by pandas Commented Jan 25, 2022 at 13:43
  • OK, thanks to all for replying. I will try to give a sample test latter. My INPUT are csv files with clinical datas. My expected output are SDTM datasets (in csv format) based on input data, which imply formatting,and calculations. Commented Jan 25, 2022 at 14:01

1 Answer 1

2

Strategy : build a multilevel column index to more easily manipulate the data

  1. Create a mapping of tuples for the columns names

    columns_map = dict()
    for e in sorted(df.columns):
        e_header = e.replace('_del','')
        if e_header in signs:
            tuple = (signs[e_header], "del" if 'del' in e else 'sym')
            columns_map[e] = tuple
    
    >>> columns_map
    {'sym_ete': ('Sneezing', 'sym'),
     'sym_ete_del': ('Sneezing', 'del'),
     'sym_fie': ('Fever', 'sym'),
     'sym_fie_del': ('Fever', 'del'),
     'sym_gra': ('Fatty cough', 'sym'),
     'sym_gra_del': ('Fatty cough', 'del'),
     'sym_tou': ('Dry cough', 'sym'),
     'sym_tou_del': ('Dry cough', 'del')}
    
  2. Build the multilevel dataframe

    dfh = df.set_index(['pat_ide','sym_dat']).rename(columns = columns_map)
    dfh.columns = pd.MultiIndex.from_tuples(dfh.columns, names=['FAOBJ', 'data_name'])
    dfh = dfh.sort_index(axis=1)
    
    >>>dfh
    
    FAOBJ              Dry cough     Fatty cough     Fever     Sneezing    
    data_name            del sym         del sym   del sym      del sym
    pat_ide sym_dat                                                        
    PAT001  2022-01-23     2   1           3   2     1   0        3   0
    ...
    
  3. Data juggling

    dfh = (dfh.swaplevel(axis=1)                # put data_name level a the top
              .drop(['sym'], axis = 1)          # drop (unused data)
              .droplevel('data_name', axis=1)   # level 0 as now only 'del"
              .stack()                          # rows to column
              .rename('DEL', axis=1)            # rename the data
              .reset_index()                    # get rid ov the multilevel index
          )
    
  4. Inserting missing data ad tyding

    dfh = dfh.rename(columns={"sym_dat": "SYM_DAT"})
    dfh['DOMAIN'] = 'FA'
    dfh['USUBJID'] = '01-COV'+ dfh['pat_ide']     # mock up
    dfh = dfh.reindex(['pat_ide', 'DOMAIN', 'USUBJID', 'FAOBJ', 'SYM_DAT', 'DEL'], axis=1)
    

Result:

>>> dfh
   pat_ide DOMAIN       USUBJID        FAOBJ     SYM_DAT  DEL
0   PAT001     FA  01-COVPAT001    Dry cough  2022-01-23    2
1   PAT001     FA  01-COVPAT001  Fatty cough  2022-01-23    3
2   PAT001     FA  01-COVPAT001        Fever  2022-01-23    1
3   PAT001     FA  01-COVPAT001     Sneezing  2022-01-23    3
4   PAT001     FA  01-COVPAT001    Dry cough  2022-01-24    2
5   PAT001     FA  01-COVPAT001  Fatty cough  2022-01-24    3
6   PAT001     FA  01-COVPAT001        Fever  2022-01-24    1
7   PAT001     FA  01-COVPAT001     Sneezing  2022-01-24    3
8   PAT001     FA  01-COVPAT001    Dry cough  2022-01-25    2
9   PAT001     FA  01-COVPAT001  Fatty cough  2022-01-25    3
10  PAT001     FA  01-COVPAT001        Fever  2022-01-25    1
11  PAT001     FA  01-COVPAT001     Sneezing  2022-01-25    3
12  PAT002     FA  01-COVPAT002    Dry cough  2022-01-23    2
13  PAT002     FA  01-COVPAT002  Fatty cough  2022-01-23    3
14  PAT002     FA  01-COVPAT002        Fever  2022-01-23    1
15  PAT002     FA  01-COVPAT002     Sneezing  2022-01-23    3
16  PAT002     FA  01-COVPAT002    Dry cough  2022-01-24    2
17  PAT002     FA  01-COVPAT002  Fatty cough  2022-01-24    3
18  PAT002     FA  01-COVPAT002        Fever  2022-01-24    1
19  PAT002     FA  01-COVPAT002     Sneezing  2022-01-24    3
20  PAT002     FA  01-COVPAT002    Dry cough  2022-01-25    2
21  PAT002     FA  01-COVPAT002  Fatty cough  2022-01-25    3
22  PAT002     FA  01-COVPAT002        Fever  2022-01-25    1
23  PAT002     FA  01-COVPAT002     Sneezing  2022-01-25    3
24  PAT002     FA  01-COVPAT002    Dry cough  2022-01-26    2
25  PAT002     FA  01-COVPAT002  Fatty cough  2022-01-26    3
26  PAT002     FA  01-COVPAT002        Fever  2022-01-26    1
27  PAT002     FA  01-COVPAT002     Sneezing  2022-01-26    3
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.