diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py index c1b600b4df6665cbeac07a3d10be55bbee6cb3e3..46400f178dfa7f4c3c674aff65a2bc669cbfd1e6 100644 --- a/src/python/extractMPs.py +++ b/src/python/extractMPs.py @@ -14,12 +14,12 @@ output_folder_dict = sys.argv[4] #'./data/politicians/lastnames/' class MPs_Extractor(object): - def __init__(self, years, input_file, output_file_csv, output_folder_dict, df_exc): + def __init__(self, years, input_file, output_file_csv, output_folder_dict, df_addInfo): self.input_file = input_file self.output_file_csv = output_file_csv self.output_folder_dict = output_folder_dict self.range_years = range(years[0], years[1] + 1) - self.df_exc = df_exc + self.df_addInfo = df_addInfo # function to get lists of lastnames # input: @@ -46,7 +46,7 @@ class MPs_Extractor(object): str_canton2 = 'canton' # function to split lastname and save meaningful part(s) to list - def split_lastname(lastname, uniqueID, tpl_canton, str_canton = ''): + def split_lastname(lastname, uniqueID, tpl_canton, str_addInfo, str_canton = ''): # if last name is a composite name, e.g. 
'von Arx' and 'de Stoppani' lastname_split = lastname.split() if len(lastname_split) > 1: @@ -83,9 +83,9 @@ class MPs_Extractor(object): str_cantonabbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] str_citizenship = df_year['Citizenship'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] str_firstname = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - str_doublename = df_year['DoubleName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + str_addInfo = df_year['additionalInfo'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname, str_doublename) + return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname, str_addInfo) # create empty lists for last names list_names = [] @@ -100,28 +100,29 @@ class MPs_Extractor(object): # if there is an extra double name # TODO: maybe easier by just adding second part as additional term, could then also be used to account for Wohnort - if df_temp.iloc[0]['DoubleName'] != '' and isinstance(df_temp.iloc[0]['DoubleName'], str): - # extract unique index - uniqueID = df_temp.iloc[0]['uniqueIndex'] - - # get canton information for that uniqueID - tpl_canton = get_canton(df_year, uniqueID) - - #print('double name', df_temp) - doublename = df_temp.iloc[0]['DoubleName'] - print(doublename) - - # if last name is a double name, e.g. 
'Meier-Müller' - lastname_split2 = doublename.replace('-', ' ').split() - if len(lastname_split2) > 1: - # write each part of double name into corresponding list - for item in lastname_split2: - list_names.append((str_double, item, lastname, uniqueID) + tpl_canton) - # write double name into list - list_names.append((str_double, doublename, lastname, uniqueID) + tpl_canton) - # write double name without space into list - list_names.append((str_double, ''.join(doublename.split('-')), lastname, uniqueID) + tpl_canton) - + str_addInfo = df_temp.iloc[0]['additionalInfo'] +# if df_temp.iloc[0]['additionalInfo'] != '': + ## extract unique index + #uniqueID = df_temp.iloc[0]['uniqueIndex'] +# + ## get canton information for that uniqueID + #tpl_canton = get_canton(df_year, uniqueID) +# + ##print('double name', df_temp) + #doublename = df_temp.iloc[0]['DoubleName'] + #print(doublename) +# + ## if last name is a double name, e.g. 'Meier-Müller' + #lastname_split2 = doublename.replace('-', ' ').split() + #if len(lastname_split2) > 1: + ## write each part of double name into corresponding list + #for item in lastname_split2: + #list_names.append((str_double, item, lastname, uniqueID) + tpl_canton) + ## write double name into list + #list_names.append((str_double, doublename, lastname, uniqueID) + tpl_canton) + ## write double name without space into list + #list_names.append((str_double, ''.join(doublename.split('-')), lastname, uniqueID) + tpl_canton) +# # if only one person with that last name if df_temp.drop_duplicates(['uniqueIndex']).shape[0] == 1: # extract unique index @@ -131,7 +132,7 @@ class MPs_Extractor(object): tpl_canton = get_canton(df_year, uniqueID) # write complete name to list of last names - split_lastname(lastname, uniqueID, tpl_canton) + split_lastname(lastname, uniqueID, tpl_canton, str_addInfo) # if there are several people with the same last name else: @@ -144,58 +145,57 @@ class MPs_Extractor(object): tpl_canton = get_canton(df_year, uniqueID) # 
write the lastname to the list - split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')') + split_lastname(lastname, uniqueID, tpl_canton, str_addInfo, row['LastName'] + ' (' + row['CantonName'] + ')') return list_names def extract(self): # read excel file and save first sheet to a dataframe - xl = pd.ExcelFile(self.input_file) - str_sheetname = xl.sheet_names[0] - orddict = xl.parse([str_sheetname]) - df = orddict[str_sheetname] + _xl = pd.ExcelFile(self.input_file) + _str_sheetname = _xl.sheet_names[0] + _orddict = _xl.parse([_str_sheetname]) + _df = _orddict[_str_sheetname] # drop duplicate entries - df = df.drop_duplicates() + _df = _df.drop_duplicates() # extract all people participating after 1890 # starting from 1891 - df1 = df[pd.to_datetime(df['DateLeaving']) > datetime.datetime(1890, 12, 31)] + _df1 = _df[pd.to_datetime(_df['DateLeaving']) > datetime.datetime(1890, 12, 31)] # get rid of people with wrong DateLeaving that is encoded as 1899-12-30 00:00:00 instead of dd.mm.yyyy - df1 = df1[df1['DateLeaving'].str.len() == 10] + _df1 = _df1[_df1['DateLeaving'].str.len() == 10] # current members - df2 = df[df['DateLeaving'].isnull()] + _df2 = _df[_df['DateLeaving'].isnull()] # combine these dataframes - df_after1890 = pd.concat([df1, df2]) + _df_after1890 = pd.concat([_df1, _df2]) # generate unique ID for every person # generate two now columns - df_after1890 = df_after1890.assign(uniqueIndex=0) - df_after1890 = df_after1890.assign(DoubleName='') + _df_after1890 = _df_after1890.assign(uniqueIndex=0) + _df_after1890 = _df_after1890.assign(additionalInfo='') # group by first and last name, and date of birth - grouped = df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"]) + _grouped = _df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"]) # assign first index to all entries of a person - for list_index in grouped.groups.values(): - df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0] + for 
list_index in _grouped.groups.values(): + _df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0] - # some people are referred to by their double-name, add these double-namse to extra-column - for row in self.df_exc.itertuples(index=False, name='Pandas'): - df_after1890.loc[(df_after1890['LastName'] == row[0]) & (df_after1890['FirstName'] == row[1]), 'DoubleName'] = row[2] + # some people need additional information such as a place or a second last name to be uniquely identified + for row in self.df_addInfo.itertuples(index=False, name='Pandas'): + _df_after1890.loc[(_df_after1890['LastName'] == row[0]) & (_df_after1890['FirstName'] == row[1]), 'additionalInfo'] = row[3] -# print(df_after1890.loc[df_after1890['DoubleName'] != '']) # write dataframe to csv - df_after1890.to_csv(self.output_file_csv) + _df_after1890.to_csv(self.output_file_csv) # for every year for year in self.range_years: # extract every MP that was active in that year # (every MP of a year joined before the end of the and left after the beginning of the year) - df_year = df_after1890[pd.to_datetime(df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)] - df_year = df_year[pd.to_datetime(df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)] - print(year, df_year.shape) + _df_year = _df_after1890[pd.to_datetime(_df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)] + _df_year = _df_year[pd.to_datetime(_df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)] + print(year, _df_year.shape) # write df_year to a yearly csv file # str_year = str(year) @@ -203,8 +203,9 @@ class MPs_Extractor(object): # create a pandas dataframe from list of names # !!!
list contains errors, see definition of function - list_lastnames = self.get_list_of_lastnames(df_year, df_after1890) - df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'DoubleName')) + _list_lastnames = self.get_list_of_lastnames(_df_year, _df_after1890) + df_lastnames = pd.DataFrame(_list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'additionalInfo')) + print(df_lastnames) # dump dictionary of last names to a pickle file # path = pathlib.