From be210bb1754b3d37418e3205d613327491005cef Mon Sep 17 00:00:00 2001 From: Lilian Gasser <gasserli@ethz.ch> Date: Wed, 23 Jan 2019 16:06:53 +0100 Subject: [PATCH] cleaned, updated description, etc --- src/python/extractMPs.py | 127 ++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 63 deletions(-) diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py index 49068153..75d39623 100644 --- a/src/python/extractMPs.py +++ b/src/python/extractMPs.py @@ -14,33 +14,36 @@ output_folder_dict = sys.argv[4] #'./data/politicians/lastnames/' class MPs_Extractor(object): - def __init__(self, years, input_file, output_file_csv, output_folder_dict, df_addInfo): + def __init__(self, years, input_file, input_file_addInfo, output_file_csv, output_folder_dict): self.input_file = input_file + self.input_file_addInfo = input_file_addInfo self.output_file_csv = output_file_csv self.output_folder_dict = output_folder_dict self.range_years = range(years[0], years[1] + 1) - self.df_addInfo = df_addInfo - # function to get lists of lastnames + # function to refine dataframe for name disambiguation # input: # - df_year: dataframe for a year # output: - # TODO: update - # - list_names: - # contains: - # - list of last names that appear only once and cannot be split - # - list of last name that are made up of two names such as 'Meier-Müller' - # for each double name, four entries are made: - # - ('Meier', 'Meier-Müller') - # - ('Müller', 'Meier-Müller') - # - ('Meier-Müller', 'Meier-Müller') - # - ('MeierMüller', 'Meier-Müller') - # - list for composite last names such as 'von Arx' or 'de Stoppani' - # will be saved as ('Arx', 'von Arx') - # - list for people with the same last names - # will be saved as (lastname, lastname (canton)) for each person - # if the name is a composite name: ('Arx', 'von Arx (canton)') - def get_list_of_lastnames(self, df_year, df_after1890): + # - df_year: updated dataframe with shortName, completeName and nameType + # - complete name: lastname firstname (canton cantonabbr) + # - if lastname appears multiple times + # nameType = 'canton' + # shortname = according to following three types + # - if lastname is a composite name such as 'von Arx' or 'de Stoppani' + # nameType = 'comp' + # shortName = Arx or Stoppani + # - if lastname is a double name such as 'Meier-Müller' + # nameType = 'double' + # for each double name, four entries are made: + # - shortName = 'Meier' + # - shortName = 'Müller' + # - shortName = 'Meier-Müller' + # - shortName = 'MeierMüller' + # - if lastname is none of the above + # nameType = 'simple' + # shortName = lastname + def refine_yearly_dataframe(self, df_year): str_simple = 'simple' str_double = 'double' str_comp = 'comp' @@ -48,31 +51,23 @@ class MPs_Extractor(object): # function to split lastname and save meaningful part(s) to list def split_lastname(df_year, lastname, uniqueID, str_completeName, bln_unique = True): - # if last name is a composite name, e.g. 'von Arx' and 'de Stoppani' - lastname_split = lastname.split() - if len(lastname_split) > 1: - for item in lastname_split: - if item not in ['von', 'de', 'Ab', 'van']: - # write distinctive item to extended list - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'shortName'] = item - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName - if bln_unique: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_comp - else: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton - else: - # if last name is a double name, e.g. 'Meier-Müller' - lastname_split2 = lastname.replace('-', ' ').split() - if len(lastname_split2) > 1: - # set lastname and completename - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'shortName'] = lastname - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName - # set nametype - if bln_unique: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_double - else: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton + def update_dataframe(df_year, str_shortName, str_completeName, str_nameType, bln_unique): + + # set short name + df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'shortName'] = str_shortName + + # set complete name + df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName + + # set name type + if bln_unique: + df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_nameType + else: + df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton + + # generate three more entries for double names + if str_nameType == 'double': # duplicate this entry three times df_tripled = df_year[df_year['uniqueIndex'] == uniqueID] df_tripled = pd.concat([df_tripled]*3, ignore_index = True) @@ -88,13 +83,26 @@ class MPs_Extractor(object): # concatenate with yearly dataframe df_year = pd.concat([df_year, df_tripled], ignore_index = True) + return df_year + + # if last name is a composite name, e.g. 'von Arx' and 'de Stoppani' + lastname_split = lastname.split() + if len(lastname_split) > 1: + for item in lastname_split: + if item not in ['von', 'de', 'Ab', 'van']: + # update dataframe + df_year = update_dataframe(df_year, item, str_completeName, str_comp, bln_unique) + + else: + # if last name is a double name, e.g. 'Meier-Müller' + lastname_split2 = lastname.replace('-', ' ').split() + if len(lastname_split2) > 1: + # update dataframe + df_year = update_dataframe(df_year, lastname, str_completeName, str_double, bln_unique) + else: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'shortName'] = lastname - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName - if bln_unique: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_simple - else: - df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton + # update dataframe + df_year = update_dataframe(df_year, lastname, str_completeName, str_simple, bln_unique) return df_year @@ -175,7 +183,8 @@ class MPs_Extractor(object): _df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0] # some people need additional information such as a place or a second last name to be uniquely identified - for row in self.df_addInfo.itertuples(index=False, name='Pandas'): + df_addInfo = pd.read_csv(self.input_file_addInfo) + for row in df_addInfo.itertuples(index=False, name='Pandas'): _df_after1890.loc[(_df_after1890['LastName'] == row[0]) & (_df_after1890['FirstName'] == row[1]), 'additionalInfo'] = row[3] # write dataframe to csv @@ -195,26 +204,18 @@ class MPs_Extractor(object): df_year = df_year.assign(shortName='') df_year = df_year.assign(completeName='') - # write df_year to a yearly csv file - # str_year = str(year) - # df_year.to_csv('home/lili/NLP_DemocraSci/nlp-democracy/output/MPs/MPs_' + str_year + '.csv') - - # create a pandas dataframe from list of names + # refine yearly dataframe for name disambiguation # !!! list contains errors, see definition of function - df_year = self.get_list_of_lastnames(df_year, _df_after1890) - print(df_year) + df_year = self.refine_yearly_dataframe(df_year) + #print(df_year) # dump dictionary of last names to a pickle file -# path = pathlib. with open(self.output_folder_dict + str(year) + "_lastnames.pickle", 'wb') as f: pickle.dump(df_year, f) # years of interest -years = [1891, 1893] #2016 - -# open additional info file -df_addInfo = pd.read_csv(input_file_addInfo) +years = [1891, 2016] #2016 -mps_extractor = MPs_Extractor(years, input_file, output_file_csv, output_folder_dict, df_addInfo) +mps_extractor = MPs_Extractor(years, input_file, input_file_addInfo, output_file_csv, output_folder_dict) mps_extractor.extract() -- GitLab