Implemented speaker disambiguation by council and by date of discussion

14eb9b77 · Lili Gasser · 321563de · 14eb9b77 · 14eb9b77 · 14eb9b77
Commit 14eb9b77 authored 6 years ago by Lili Gasser
--- a/data/lists/wrongly_identified_speakers.txt
+++ b/data/lists/wrongly_identified_speakers.txt
@@ -7,7 +7,7 @@ also check for council:

 one MP not active in whole year, leads to other not uniquely identified
 -----------------------------------------------------------------------
-1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!)
+1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!) --> solved!
 1925/20029967: Seiler (in December, the second Seiler already left) --> finds two!) --> solved!
 1925/20029967: Huber (in December, the second Huber already left) --> finds two!) --> solved because only NR!
 1925/20029882: Naine (in June, there is only one Naine, in December, another one joins --> finds two!) also in ...96, ...97, etc. --> solved!

--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -141,7 +141,6 @@ file_doc.get_council_date()
 #len(files_to_process)
 file_doc.check_discussion()

-str_date = '22.09.1925'
-
+str_date = '1925-12-09 08:00'
 import datetime
-datetime.datetime.strptime(str_date, '%d.%m.%Y')
+datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -56,11 +56,11 @@ def get_council_and_date(path_meta_xml_file):
    # parse, get root and then part of interest
    XML_tree = ET.parse(path_meta_xml_file)
    XML_root = XML_tree.getroot()
-    XML_poi = XML_root[0].find('META_FROM_DB')
+    XML_poi = XML_root[0]

    # get council and date
-    str_council = XML_poi.attrib['RAT']
-    str_date = XML_poi.attrib['DATUM']
+    str_council = XML_poi.find('META_FROM_DB').attrib['RAT']
+    str_date = XML_poi.attrib['PUBLIKATIONS_DATUM']

    return (str_council, str_date)

@@ -570,7 +570,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str

        # cannot happen for the first term in list_oi
        elif name_type == 'canton':
-            list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo = get_list_cantons(df_names, str_name.split(' ')[0])
+            list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo = get_list_cantons(df_names, str_name.split(' ')[0], str_council)
            canton_type = ''
            if term in list_cantonname:
                str_canton = term
@@ -608,10 +608,11 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                # get rid of CANTON MISSING
                str_name = str_name.split(' ')[0]

+                df_temp = get_df_temp_canton(df_names, str_name, str_council)
                # extract uniqueID
-                # if Citizenship, do proper comparison
+                # if Citizenship, get list of cities and compare each to term
                if canton_type == 'Citizenship':
-                    df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name)]
+                    # get list of cities
                    list_cities = [entry for entry in df_temp[canton_type] if str_canton in get_cities([entry])]
                    str_citizenship = ''
                    try:
@@ -621,12 +622,12 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                        print('found no or more than one person with citizenship', str_canton, str_name)
                        pass

-                    list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
-                    str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[0]
+                    list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
+                    str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0]

                else:
-                    list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
-                    str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[0]
+                    list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
+                    str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0]

                print(list_temp, list_uniqueID)

@@ -662,10 +663,12 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
            # initial checks for not uniquely identified peoples
            # TODO check for false positives of these procedures
            if name_type == 'canton':
-                df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0])]
                # check if person can be identified from council
-                list_councils = list(df_temp['CouncilName'])
-                if list_councils.count(str_council) == 1:
+                df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)]
+                if df_temp.shape[0] == 1:
+                ## check if person can be identified from council
+                #list_councils = list(df_temp['CouncilName'])
+                #if list_councils.count(str_council) == 1:
                    list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
                    str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[0]

@@ -675,36 +678,35 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                    else:
                        str_name = add_to_string(str_name, str_completeName)

-                # check if person can be identified from date of discussion
-                # TODO: is input dataformat always the same?
-                df_temp_before = df_temp[pd.to_datetime(df_temp['DateJoining']) <= datetime.datetime.strptime(str_date, '%d.%m.%Y')]
-                # TODO: replace by (or add another condition) if df_temp_before.shape[0] < df_temp.shape[0]
-                if df_temp_before.shape[0] == 1:
-                    list_temp = list(df_temp_before['uniqueIndex'])
-                    str_completeName = df_temp_before['completeName'].iloc[0]
-
-                    list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
-                    if str_completeName.split(' ')[0] == str_name.split(' ')[0]:
-                        str_name = add_to_string('', str_completeName)
-                    else:
-                        str_name = add_to_string(str_name, str_completeName)
-
-                df_temp_after = df_temp[pd.to_datetime(df_temp['DateLeaving']) >= datetime.datetime.strptime(str_date, '%d.%m.%Y')]
-                # TODO: replace by (or add another condition) if df_temp_before.shape[0] < df_temp.shape[0]
-                if df_temp_after.shape[0] == 1:
-                    list_temp = list(df_temp_after['uniqueIndex'])
-                    str_completeName = df_temp_after['completeName'].iloc[0]
-
-                    list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
-                    if str_completeName.split(' ')[0] == str_name.split(' ')[0]:
-                        str_name = add_to_string('', str_completeName)
-                    else:
-                        str_name = add_to_string(str_name, str_completeName)
-
-                print(str_date, df_temp_before.shape, df_temp_after.shape)
+                else:
+                    # check if person can be identified from date of discussion
+                    # exclude people that joined after date of discussion
+                    df_temp_before = df_temp[pd.to_datetime(df_temp['DateJoining']) <= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')]
+                    if df_temp_before.shape[0] == 1:
+                        list_temp = list(df_temp_before['uniqueIndex'])
+                        str_completeName = df_temp_before['completeName'].iloc[0]
+
+                        list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
+                        if str_completeName.split(' ')[0] == str_name.split(' ')[0]:
+                            str_name = add_to_string('', str_completeName)
+                        else:
+                            str_name = add_to_string(str_name, str_completeName)
+
+                    # exclude people that left before date of discussion
+                    df_temp_after = df_temp[pd.to_datetime(df_temp['DateLeaving']) >= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')]
+                    if df_temp_after.shape[0] == 1:
+                        list_temp = list(df_temp_after['uniqueIndex'])
+                        str_completeName = df_temp_after['completeName'].iloc[0]
+
+                        list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
+                        if str_completeName.split(' ')[0] == str_name.split(' ')[0]:
+                            str_name = add_to_string('', str_completeName)
+                        else:
+                            str_name = add_to_string(str_name, str_completeName)
+
+                    print(str_date, df_temp.shape, df_temp_before.shape, df_temp_after.shape)


-                # TODO: does this order make sense? council before date??
                # TODO: function to update list unique ID and str_name


@@ -783,13 +785,20 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
 def get_cities(list_citizenship):
    return [city[:-5] for item in list_citizenship for city in item.split(',')]

-# function to get list of places
-def get_list_cantons(df_names, str_name = ''):
-    if str_name:
-        df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)]
+def get_df_temp_canton(df_names, str_name, str_council):
+
+    if str_council in ['Nationalrat', 'Ständerat']:
+        df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)]
    else:
-        df_temp = df_names.loc[df_names['nameType']=='canton']
-    #print(df_temp)
+        df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)]
+
+    return df_temp
+
+# function to get list of places
+def get_list_cantons(df_names, str_name, str_council = ''):
+
+    df_temp = get_df_temp_canton(df_names, str_name, str_council)
+
    # list of cantons
    list_cantonname = list(df_temp['CantonName'])
    # TODO this will lead to an error!