From bc1e486f4bfe628fa24a5e26d01b81de21b6bea9 Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Wed, 23 Jan 2019 13:11:03 +0100
Subject: [PATCH] add additional info to df

---
 src/python/extractMPs.py | 107 ++++++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 53 deletions(-)

diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py
index c1b600b4..46400f17 100644
--- a/src/python/extractMPs.py
+++ b/src/python/extractMPs.py
@@ -14,12 +14,12 @@ output_folder_dict = sys.argv[4]     #'./data/politicians/lastnames/'
 
 class MPs_Extractor(object):
 
-    def __init__(self, years, input_file, output_file_csv, output_folder_dict, df_exc):
+    def __init__(self, years, input_file, output_file_csv, output_folder_dict, df_addInfo):
         self.input_file = input_file
         self.output_file_csv = output_file_csv
         self.output_folder_dict = output_folder_dict
         self.range_years = range(years[0], years[1] + 1)
-        self.df_exc = df_exc
+        self.df_addInfo = df_addInfo
 
     # function to get lists of lastnames
     # input:
@@ -46,7 +46,7 @@ class MPs_Extractor(object):
         str_canton2 = 'canton'
 
     	# function to split lastname and save meaningful part(s) to list
-        def split_lastname(lastname, uniqueID, tpl_canton, str_canton = ''):
+        def split_lastname(lastname, uniqueID, tpl_canton, str_addInfo, str_canton = ''):
     	    # if last name is a composite name, e.g. 'von Arx' and 'de Stoppani'
             lastname_split = lastname.split()
             if len(lastname_split) > 1:
@@ -83,9 +83,9 @@ class MPs_Extractor(object):
             str_cantonabbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
             str_citizenship = df_year['Citizenship'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
             str_firstname = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
-            str_doublename = df_year['DoubleName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
+            str_addInfo = df_year['additionalInfo'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
 
-            return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname, str_doublename)
+            return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname, str_addInfo)
 
     	# create empty lists for last names
         list_names = []
@@ -100,28 +100,29 @@ class MPs_Extractor(object):
 
     	    # if there is an extra double name
             # TODO: maybe easier by just adding second part as additional term, could then also be used to account for Wohnort
-            if df_temp.iloc[0]['DoubleName'] != '' and isinstance(df_temp.iloc[0]['DoubleName'], str):
-    		# extract unique index
-                uniqueID = df_temp.iloc[0]['uniqueIndex']
-
-                # get canton information for that uniqueID
-                tpl_canton = get_canton(df_year, uniqueID)
-
-                #print('double name', df_temp)
-                doublename = df_temp.iloc[0]['DoubleName']
-                print(doublename)
-
-        		# if last name is a double name, e.g. 'Meier-MÃ¼ller'
-                lastname_split2 = doublename.replace('-', ' ').split()
-                if len(lastname_split2) > 1:
-        		    # write each part of double name into corresponding list
-                    for item in lastname_split2:
-                        list_names.append((str_double, item, lastname, uniqueID) + tpl_canton)
-        		    # write double name into list
-                    list_names.append((str_double, doublename, lastname, uniqueID) + tpl_canton)
-        		    # write double name without space into list
-                    list_names.append((str_double, ''.join(doublename.split('-')), lastname, uniqueID) + tpl_canton)
-
+            str_addInfo = df_temp.iloc[0]['additionalInfo']
+#            if df_temp.iloc[0]['additionalInfo'] != '':
+        		## extract unique index
+                #uniqueID = df_temp.iloc[0]['uniqueIndex']
+#
+                ## get canton information for that uniqueID
+                #tpl_canton = get_canton(df_year, uniqueID)
+#
+                ##print('double name', df_temp)
+                #doublename = df_temp.iloc[0]['DoubleName']
+                #print(doublename)
+#
+        		## if last name is a double name, e.g. 'Meier-Müller'
+                #lastname_split2 = doublename.replace('-', ' ').split()
+                #if len(lastname_split2) > 1:
+        		    ## write each part of double name into corresponding list
+                    #for item in lastname_split2:
+                        #list_names.append((str_double, item, lastname, uniqueID) + tpl_canton)
+        		    ## write double name into list
+                    #list_names.append((str_double, doublename, lastname, uniqueID) + tpl_canton)
+        		    ## write double name without space into list
+                    #list_names.append((str_double, ''.join(doublename.split('-')), lastname, uniqueID) + tpl_canton)
+#
     	    # if only one person with that last name
             if df_temp.drop_duplicates(['uniqueIndex']).shape[0] == 1:
     		# extract unique index
@@ -131,7 +132,7 @@ class MPs_Extractor(object):
                 tpl_canton = get_canton(df_year, uniqueID)
 
         		# write complete name to list of last names
-                split_lastname(lastname, uniqueID, tpl_canton)
+                split_lastname(lastname, uniqueID, tpl_canton, str_addInfo)
 
     	    # if there are several people with the same last name
             else:
@@ -144,58 +145,57 @@ class MPs_Extractor(object):
                     tpl_canton = get_canton(df_year, uniqueID)
 
         		    # write the lastname to the list
-                    split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')')
+                    split_lastname(lastname, uniqueID, tpl_canton, str_addInfo, row['LastName'] + ' (' + row['CantonName'] + ')')
 
         return list_names
 
     def extract(self):
         # read excel file and save first sheet to a dataframe
-        xl = pd.ExcelFile(self.input_file)
-        str_sheetname = xl.sheet_names[0]
-        orddict = xl.parse([str_sheetname])
-        df = orddict[str_sheetname]
+        _xl = pd.ExcelFile(self.input_file)
+        _str_sheetname = _xl.sheet_names[0]
+        _orddict = _xl.parse([_str_sheetname])
+        _df = _orddict[_str_sheetname]
 
         # drop duplicate entries
-        df = df.drop_duplicates()
+        _df = _df.drop_duplicates()
 
         # extract all people participating after 1890
         # starting from 1891
-        df1 = df[pd.to_datetime(df['DateLeaving']) > datetime.datetime(1890, 12, 31)]
+        _df1 = _df[pd.to_datetime(_df['DateLeaving']) > datetime.datetime(1890, 12, 31)]
         # get rid of people with wrong DateLeaving that is encoded as 1899-12-30 00:00:00 instead of dd.mm.yyyy
-        df1 = df1[df1['DateLeaving'].str.len() == 10]
+        _df1 = _df1[_df1['DateLeaving'].str.len() == 10]
         # current members
-        df2 = df[df['DateLeaving'].isnull()]
+        _df2 = _df[_df['DateLeaving'].isnull()]
         # combine these dataframes
-        df_after1890 = pd.concat([df1, df2])
+        _df_after1890 = pd.concat([_df1, _df2])
 
         # generate unique ID for every person
         # generate two now columns
-        df_after1890 = df_after1890.assign(uniqueIndex=0)
-        df_after1890 = df_after1890.assign(DoubleName='')
+        _df_after1890 = _df_after1890.assign(uniqueIndex=0)
+        _df_after1890 = _df_after1890.assign(additionalInfo='')
 
         # group by first and last name, and date of birth
-        grouped = df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"])
+        _grouped = _df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"])
 
         # assign first index to all entries of a person
-        for list_index in grouped.groups.values():
-            df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0]
+        for list_index in _grouped.groups.values():
+            _df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0]
 
-        # some people are referred to by their double-name, add these double-namse to extra-column
-        for row in self.df_exc.itertuples(index=False, name='Pandas'):
-            df_after1890.loc[(df_after1890['LastName'] == row[0]) & (df_after1890['FirstName'] == row[1]), 'DoubleName'] = row[2]
+        # some people need additional information such as a place or a second last name to be uniquely identified
+        for row in self.df_addInfo.itertuples(index=False, name='Pandas'):
+            _df_after1890.loc[(_df_after1890['LastName'] == row[0]) & (_df_after1890['FirstName'] == row[1]), 'additionalInfo'] = row[3]
 
-#        print(df_after1890.loc[df_after1890['DoubleName'] != ''])
         # write dataframe to csv
-        df_after1890.to_csv(self.output_file_csv)
+        _df_after1890.to_csv(self.output_file_csv)
 
         # for every year
         for year in self.range_years:
 
             # extract every MP that was active in that year
             # (every MP of a year joined before the end of the and left after the beginning of the year)
-            df_year = df_after1890[pd.to_datetime(df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)]
-            df_year = df_year[pd.to_datetime(df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)]
-            print(year, df_year.shape)
+            _df_year = _df_after1890[pd.to_datetime(_df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)]
+            _df_year = _df_year[pd.to_datetime(_df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)]
+            print(year, _df_year.shape)
 
             # write df_year to a yearly csv file
         #    str_year = str(year)
@@ -203,8 +203,9 @@ class MPs_Extractor(object):
 
             # create a pandas dataframe from list of names
             # !!! list contains errors, see definition of function
-            list_lastnames = self.get_list_of_lastnames(df_year, df_after1890)
-            df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'DoubleName'))
+            _list_lastnames = self.get_list_of_lastnames(_df_year, _df_after1890)
+            df_lastnames = pd.DataFrame(_list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'additionalInfo'))
+            print(df_lastnames)
 
             # dump dictionary of last names to a pickle file
 #           path = pathlib.
-- 
GitLab