Skip to content
Snippets Groups Projects
Commit be210bb1 authored by Lili Gasser's avatar Lili Gasser
Browse files

cleaned, updated description, etc

parent f36c26c4
No related branches found
No related tags found
No related merge requests found
......@@ -14,33 +14,36 @@ output_folder_dict = sys.argv[4] #'./data/politicians/lastnames/'
class MPs_Extractor(object):
def __init__(self, years, input_file, output_file_csv, output_folder_dict, df_addInfo):
def __init__(self, years, input_file, input_file_addInfo, output_file_csv, output_folder_dict):
self.input_file = input_file
self.input_file_addInfo = input_file_addInfo
self.output_file_csv = output_file_csv
self.output_folder_dict = output_folder_dict
self.range_years = range(years[0], years[1] + 1)
self.df_addInfo = df_addInfo
# function to get lists of lastnames
# function to refine dataframe for name disambiguation
# input:
# - df_year: dataframe for a year
# output:
# TODO: update
# - list_names:
# contains:
# - list of last names that appear only once and cannot be split
# - list of last names that are made up of two names, such as 'Meier-Müller'
# for each double name, four entries are made:
# - ('Meier', 'Meier-Müller')
# - ('Müller', 'Meier-Müller')
# - ('Meier-Müller', 'Meier-Müller')
# - ('MeierMüller', 'Meier-Müller')
# - list for composite last names such as 'von Arx' or 'de Stoppani'
# will be saved as ('Arx', 'von Arx')
# - list for people with the same last names
# will be saved as (lastname, lastname (canton)) for each person
# if the name is a composite name: ('Arx', 'von Arx (canton)')
def get_list_of_lastnames(self, df_year, df_after1890):
# - df_year: updated dataframe with shortName, completeName and nameType
# - completeName: lastname firstname (canton cantonabbr)
# - if lastname appears multiple times
# nameType = 'canton'
# shortName = determined according to the three cases below
# - if lastname is a composite name such as 'von Arx' or 'de Stoppani'
# nameType = 'comp'
# shortName = Arx or Stoppani
# - if lastname is a double name such as 'Meier-Müller'
# nameType = 'double'
# for each double name, four entries are made:
# - shortName = 'Meier'
# - shortName = 'Müller'
# - shortName = 'Meier-Müller'
# - shortName = 'MeierMüller'
# - if lastname is none of the above
# nameType = 'simple'
# shortName = lastname
def refine_yearly_dataframe(self, df_year):
str_simple = 'simple'
str_double = 'double'
str_comp = 'comp'
......@@ -48,31 +51,23 @@ class MPs_Extractor(object):
# function to split lastname and save its meaningful part(s) to the yearly dataframe
def split_lastname(df_year, lastname, uniqueID, str_completeName, bln_unique = True):
# if last name is a composite name, e.g. 'von Arx' and 'de Stoppani'
lastname_split = lastname.split()
if len(lastname_split) > 1:
for item in lastname_split:
if item not in ['von', 'de', 'Ab', 'van']:
# write distinctive item to extended list
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'shortName'] = item
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName
if bln_unique:
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_comp
else:
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton
else:
# if last name is a double name, e.g. 'Meier-Müller'
lastname_split2 = lastname.replace('-', ' ').split()
if len(lastname_split2) > 1:
# set lastname and completename
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'shortName'] = lastname
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName
# set nametype
if bln_unique:
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_double
else:
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton
def update_dataframe(df_year, str_shortName, str_completeName, str_nameType, bln_unique):
# set short name
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'shortName'] = str_shortName
# set complete name
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName
# set name type
if bln_unique:
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_nameType
else:
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton
# generate three more entries for double names
if str_nameType == 'double':
# duplicate this entry three times
df_tripled = df_year[df_year['uniqueIndex'] == uniqueID]
df_tripled = pd.concat([df_tripled]*3, ignore_index = True)
......@@ -88,13 +83,26 @@ class MPs_Extractor(object):
# concatenate with yearly dataframe
df_year = pd.concat([df_year, df_tripled], ignore_index = True)
return df_year
# if last name is a composite name, e.g. 'von Arx' and 'de Stoppani'
lastname_split = lastname.split()
if len(lastname_split) > 1:
for item in lastname_split:
if item not in ['von', 'de', 'Ab', 'van']:
# update dataframe
df_year = update_dataframe(df_year, item, str_completeName, str_comp, bln_unique)
else:
# if last name is a double name, e.g. 'Meier-Müller'
lastname_split2 = lastname.replace('-', ' ').split()
if len(lastname_split2) > 1:
# update dataframe
df_year = update_dataframe(df_year, lastname, str_completeName, str_double, bln_unique)
else:
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'shortName'] = lastname
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'completeName'] = str_completeName
if bln_unique:
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_simple
else:
df_year.loc[(df_year['uniqueIndex'] == uniqueID), 'nameType'] = str_canton
# update dataframe
df_year = update_dataframe(df_year, lastname, str_completeName, str_simple, bln_unique)
return df_year
......@@ -175,7 +183,8 @@ class MPs_Extractor(object):
_df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0]
# some people need additional information such as a place or a second last name to be uniquely identified
for row in self.df_addInfo.itertuples(index=False, name='Pandas'):
df_addInfo = pd.read_csv(self.input_file_addInfo)
for row in df_addInfo.itertuples(index=False, name='Pandas'):
_df_after1890.loc[(_df_after1890['LastName'] == row[0]) & (_df_after1890['FirstName'] == row[1]), 'additionalInfo'] = row[3]
# write dataframe to csv
......@@ -195,26 +204,18 @@ class MPs_Extractor(object):
df_year = df_year.assign(shortName='')
df_year = df_year.assign(completeName='')
# write df_year to a yearly csv file
# str_year = str(year)
# df_year.to_csv('home/lili/NLP_DemocraSci/nlp-democracy/output/MPs/MPs_' + str_year + '.csv')
# create a pandas dataframe from list of names
# refine yearly dataframe for name disambiguation
# !!! list contains errors, see definition of function
df_year = self.get_list_of_lastnames(df_year, _df_after1890)
print(df_year)
df_year = self.refine_yearly_dataframe(df_year)
#print(df_year)
# dump the refined yearly dataframe of last names to a pickle file
# path = pathlib.
with open(self.output_folder_dict + str(year) + "_lastnames.pickle", 'wb') as f:
pickle.dump(df_year, f)
# years of interest
years = [1891, 1893] #2016
# open additional info file
df_addInfo = pd.read_csv(input_file_addInfo)
years = [1891, 2016] #2016
mps_extractor = MPs_Extractor(years, input_file, output_file_csv, output_folder_dict, df_addInfo)
mps_extractor = MPs_Extractor(years, input_file, input_file_addInfo, output_file_csv, output_folder_dict)
mps_extractor.extract()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment