diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py index e80f9257c9b22dcb45c98604cd3be67a850ebb49..4906815393b8856c88db90083b2bd2786c268a64 100644 --- a/src/python/extractMPs.py +++ b/src/python/extractMPs.py @@ -25,6 +25,7 @@ class MPs_Extractor(object): # input: # - df_year: dataframe for a year # output: + # TODO: update # - list_names: # contains: # - list of last names that appear only once and cannot be split @@ -43,11 +44,10 @@ class MPs_Extractor(object): str_simple = 'simple' str_double = 'double' str_comp = 'comp' - str_canton2 = 'canton' + str_canton = 'canton' # function to split lastname and save meaningful part(s) to list - def split_lastname(df_year, lastname, uniqueID, tpl_canton, str_completeName, bln_unique = True): - str_canton = 'tobedeleted' + def split_lastname(df_year, lastname, uniqueID, str_completeName, bln_unique = True): # if last name is a composite name, e.g. 'von Arx' and 'de Stoppani' lastname_split = lastname.split() if len(lastname_split) > 1: @@ -58,10 +58,8 @@ class MPs_Extractor(object): df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName if bln_unique: df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_comp - list_names.append((str_comp, item, lastname, uniqueID) + tpl_canton) else: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton2 - list_names.append((str_canton2, item, str_canton, uniqueID) + tpl_canton) + df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton else: # if last name is a double name, e.g. 'Meier-Müller' lastname_split2 = lastname.replace('-', ' ').split() @@ -72,72 +70,44 @@ class MPs_Extractor(object): # set nametype if bln_unique: df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_double - list_names.append((str_double, lastname, lastname, uniqueID) + tpl_canton) else: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton2 - list_names.append((str_canton2, lastname, str_canton, uniqueID) + tpl_canton) + df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton + # duplicate this entry three times df_tripled = df_year[df_year['uniqueIndex'] == uniqueID] - print(df_tripled) df_tripled = pd.concat([df_tripled]*3, ignore_index = True) - print(df_tripled) - # set short name + # set short name without - i = 0 df_tripled.loc[i, 'shortName'] = ''.join(lastname.split('-')) + # and for each separate name for item in lastname_split2: i += 1 df_tripled.loc[i, 'shortName'] = item - print(df_tripled) + # concatenate with yearly dataframe df_year = pd.concat([df_year, df_tripled], ignore_index = True) - # write each part of double name into corresponding list - for item in lastname_split2: - if bln_unique: - list_names.append((str_double, item, lastname, uniqueID) + tpl_canton) - else: - list_names.append((str_canton2, item, str_canton, uniqueID) + tpl_canton) - # TODO: how to add double names to dataframe? create another entry??? - # write double name into list -# list_names.append((str_double, lastname, lastname, uniqueID) + tpl_canton) - # write double name without space into list - list_names.append((str_double, ''.join(lastname.split('-')), lastname, uniqueID) + tpl_canton) else: df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'shortName'] = lastname df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName if bln_unique: df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_simple - list_names.append((str_simple, lastname, lastname, uniqueID) + tpl_canton) else: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton2 - list_names.append((str_canton2, lastname, str_canton, uniqueID) + tpl_canton) + df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton return df_year - # function to get canton and citizenship for uniqueID - def get_canton(df_year, uniqueID): - str_cantonname = df_year['CantonName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - str_cantonabbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - str_citizenship = df_year['Citizenship'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - str_firstname = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - str_addInfo = df_year['additionalInfo'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - - return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname, str_addInfo) - def _get_complete_name(df_year, uniqueID): - str_lastName = df_year['LastName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - str_firstName = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - str_cantonName = df_year['CantonName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - str_cantonAbbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - str_canton = '(' + ' '.join((str_cantonName, str_cantonAbbr)) + ')' - str_completeName = ' '.join((str_lastName, str_firstName, str_canton)) + _str_lastName = df_year['LastName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + _str_firstName = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + _str_cantonName = df_year['CantonName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + _str_cantonAbbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + _str_canton = '(' + ' '.join((_str_cantonName, _str_cantonAbbr)) + ')' + str_completeName = ' '.join((_str_lastName, _str_firstName, _str_canton)) return str_completeName - # create empty lists for last names - list_names = [] - # for every last name for lastname in df_year['LastName'].drop_duplicates(): #print('name', lastname, type(lastname)) @@ -151,14 +121,11 @@ class MPs_Extractor(object): # extract unique index uniqueID = df_temp.iloc[0]['uniqueIndex'] - # get canton information for that uniqueID - tpl_canton = get_canton(df_year, uniqueID) - # get complete name str_completeName = _get_complete_name(df_year, uniqueID) # write complete name to list of last names - df_year = split_lastname(df_year, lastname, uniqueID, tpl_canton, str_completeName, bln_unique = True) + df_year = split_lastname(df_year, lastname, uniqueID, str_completeName, bln_unique = True) # if there are several people with the same last name else: @@ -167,16 +134,13 @@ class MPs_Extractor(object): # extract unique index uniqueID = df_temp.loc[idx]['uniqueIndex'] - # get canton information for that uniqueID - tpl_canton = get_canton(df_year, uniqueID) - # get complete name str_completeName = _get_complete_name(df_year, uniqueID) # write the lastname to the list - df_year = split_lastname(df_year, lastname, uniqueID, tpl_canton, str_completeName, bln_unique = False) + df_year = split_lastname(df_year, lastname, uniqueID, str_completeName, bln_unique = False) - return list_names, df_year + return df_year def extract(self): # read excel file and save first sheet to a dataframe @@ -222,14 +186,14 @@ class MPs_Extractor(object): # extract every MP that was active in that year # (every MP of a year joined before the end of the and left after the beginning of the year) - _df_year = _df_after1890[pd.to_datetime(_df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)] - _df_year = _df_year[pd.to_datetime(_df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)] - print(year, _df_year.shape) + df_year = _df_after1890[pd.to_datetime(_df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)] + df_year = df_year[pd.to_datetime(df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)] + print(year, df_year.shape) # generate new column for name type and short and complete name - _df_year = _df_year.assign(nameType='') - _df_year = _df_year.assign(shortName='') - _df_year = _df_year.assign(completeName='') + df_year = df_year.assign(nameType='') + df_year = df_year.assign(shortName='') + df_year = df_year.assign(completeName='') # write df_year to a yearly csv file # str_year = str(year) @@ -237,15 +201,13 @@ class MPs_Extractor(object): # create a pandas dataframe from list of names # !!! list contains errors, see definition of function - _list_lastnames, _df_year = self.get_list_of_lastnames(_df_year, _df_after1890) - df_lastnames = pd.DataFrame(_list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'additionalInfo')) - print(df_lastnames) - print(_df_year) + df_year = self.get_list_of_lastnames(df_year, _df_after1890) + print(df_year) # dump dictionary of last names to a pickle file # path = pathlib. with open(self.output_folder_dict + str(year) + "_lastnames.pickle", 'wb') as f: - pickle.dump(df_lastnames, f) + pickle.dump(df_year, f) # years of interest @@ -256,21 +218,3 @@ df_addInfo = pd.read_csv(input_file_addInfo) mps_extractor = MPs_Extractor(years, input_file, output_file_csv, output_folder_dict, df_addInfo) mps_extractor.extract() - - -#%% -import pandas as pd -input_file_addInfo = './data/politicians/MPs_additionalInfo.csv' -df_addInfo = pd.read_csv(input_file_addInfo) -df_addInfo -df_new = df_addInfo[df_addInfo['LastName']=='Blumer'] -df_new = pd.concat([df_new]*3, ignore_index=True) -df_new - - -df_new.loc[0, 'Additional'] = 'first' -df_new.loc[1, 'Additional'] = 'second' -df_new.loc[2, 'Additional'] = 'both' -df_new - -pd.concat([df_addInfo, df_new], ignore_index = True)