diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py index ffb22273d02ed619cab3227a6eceb9fa19420d3d..5f36334c9c85da686cfd38e6cf610069281fd5e8 100644 --- a/src/python/extractMPs.py +++ b/src/python/extractMPs.py @@ -10,16 +10,18 @@ import sys input_file = sys.argv[1] #'./data/politicians/Ratsmitglieder_1848_DE_corr.xlsx' input_file_addInfo = sys.argv[2] #'./data/politicians/MPs_additionalInfo.csv' output_file_csv = sys.argv[3] #'./data/politicians/MPs_after1890.csv' -output_folder_dict = sys.argv[4] #'./data/politicians/lastnames/' +output_folder_pickle = sys.argv[4] #'./data/politicians/yearly_pickles/' +output_folder_csv = sys.argv[5] #'./data/politicians/yearly_csvs/' class MPs_Extractor(object): - def __init__(self, years, input_file, input_file_addInfo, output_file_csv, output_folder_dict): + def __init__(self, years, input_file, input_file_addInfo, output_file_csv, output_folder_pickle, output_folder_csv): self.input_file = input_file self.input_file_addInfo = input_file_addInfo self.output_file_csv = output_file_csv - self.output_folder_dict = output_folder_dict + self.output_folder_pickle = output_folder_pickle + self.output_folder_csv = output_folder_csv self.range_years = range(years[0], years[1] + 1) # function to refine dataframe for name disambiguation @@ -215,14 +217,19 @@ class MPs_Extractor(object): #print(df_year) # dump dataframe to a pickle file - if not os.path.exists(self.output_folder_dict): - os.makedirs(self.output_folder_dict ) - with open(self.output_folder_dict + str(year) + "_MPs.pickle", 'wb') as f: + if not os.path.exists(self.output_folder_pickle): + os.makedirs(self.output_folder_pickle) + with open(self.output_folder_pickle + str(year) + "_MPs.pickle", 'wb') as f: pickle.dump(df_year, f) + # write dataframe to csv + if not os.path.exists(self.output_folder_csv): + os.makedirs(self.output_folder_csv) + df_year.to_csv(self.output_folder_csv + str(year) + "_MPs.csv") + # years of interest years = [1891, 2016] #2016 -mps_extractor = MPs_Extractor(years, input_file, input_file_addInfo, output_file_csv, output_folder_dict) +mps_extractor = MPs_Extractor(years, input_file, input_file_addInfo, output_file_csv, output_folder_pickle, output_folder_csv) mps_extractor.extract() diff --git a/src/sh/extract_MPs.sh b/src/sh/extract_MPs.sh index a5de3bfd2df03f6735e4b76eabb8a6c47f8d67fb..4bb44bd9059bb9b82979a351233a3bc5247b199b 100755 --- a/src/sh/extract_MPs.sh +++ b/src/sh/extract_MPs.sh @@ -1,5 +1,10 @@ #!/bin/bash path_data=data/politicians/ +input_file=${path_data}Ratsmitglieder_1848_DE_corr.xlsx +input_file_addInfo=${path_data}MPs_additionalInfo.csv +output_file_csv=${path_data}MPs_after1890.csv +output_folder_pickle=${path_data}yearly_pickles/ +output_folder_csv=${path_data}yearly_csvs/ -renku run --isolation python src/python/extractMPs.py ${path_data}Ratsmitglieder_1848_DE_corr.xlsx ${path_data}MPs_additionalInfo.csv ${path_data}MPs_after1890.csv ${path_data}lastnames/ +renku run --isolation --output $output_file_csv --output $output_folder_pickle --output $output_folder_csv python src/python/extractMPs.py $input_file $input_file_addInfo $output_file_csv $output_folder_pickle $output_folder_csv