Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
democrasci_preprocWP1
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Marta Balode
democrasci_preprocWP1
Commits
3a8ce1a1
Commit
3a8ce1a1
authored
6 years ago
by
Lili Gasser
Browse files
Options
Downloads
Patches
Plain Diff
WIP: name disambiguation doubled double names
parent
8cbe4c86
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/python/utils_annot.py
+63
-80
63 additions, 80 deletions
src/python/utils_annot.py
with
63 additions
and
80 deletions
src/python/utils_annot.py
+
63
−
80
View file @
3a8ce1a1
...
...
@@ -274,9 +274,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
if
bln_print
:
print
(
'
name
'
,
str_name
,
'
role
'
,
str_role
)
# get rid of doubled double names
# TODO
# get rid of 'Präsident stimmt nicht Président ne vote pas'
if
set
(
str_role
.
split
()).
intersection
(
set
([
'
Präsident
'
,
'
Präsidentin
'
,
'
Président
'
,
'
Présidente
'
]))
and
not
str_name
:
if
set
([
'
stimmt
'
,
'
nicht
'
,
'
vote
'
,
'
pas
'
]).
intersection
(
list_oi
):
...
...
@@ -423,94 +420,58 @@ def flatten(l):
# - str_name: string to which name should be attached
# - list_uniqueID: list with one or several uniqueIDs
# - list_tupels: list of tupels containing all types of names
# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
def
find_names
(
list_oi
,
list_roles
,
list_roles_ext
,
df_names
,
list_notnames
,
bln_print
=
False
):
def
get_string
(
term
,
df_names
,
str_name
,
list_uniqueID
):
# get name type
name_type
=
df_names
[
'
nameType
'
].
loc
[
df_names
[
'
shortName
'
]
==
term
].
iloc
[
0
]
print
(
df_names
[
df_names
[
'
shortName
'
]
==
term
])
print
(
term
)
print
(
name_type
)
if
name_type
!=
'
simple
'
:
print
(
df_names
[
df_names
[
'
shortName
'
]
==
term
]
)
print
(
term
,
name_type
)
# extract uniqueID
# extract uniqueID
and complete name for this term
list_temp
=
[]
# TODO might lead to doubled double names
if
name_type
in
[
'
simple
'
,
'
double
'
,
'
comp
'
]:
list_temp
=
[
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
term
)].
iat
[
0
,
df_names
.
columns
.
get_loc
(
'
uniqueIndex
'
)]]
str_completeName
=
df_names
[
'
completeName
'
].
loc
[
df_names
[
'
shortName
'
]
==
term
].
iloc
[
0
]
str_name
=
add_to_string
(
str_name
,
str_completeName
)
# TODO: how to handle for people mentioned in text???
elif
name_type
in
[
'
canton
'
]:
list_temp
=
list
(
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
term
)].
iloc
[:,
df_names
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
str_name
=
add_to_string
(
str_name
,
term
+
'
(CANTON MISSING)
'
)
print
(
list_temp
)
print
(
str_name
)
## if it is one of the simple names
#if term in list(df_names['shortName'].loc[df_names['nameType']=='simple']):
#str_name = add_to_string(str_name, term)
#name_type = 'simple'
## if it is a double name
#elif term in list(df_names['shortName'].loc[df_names['nameType']=='double']):
#if bln_print:
#print(5*'\n', 'DOUBLE NAME')
## get correct name
#correct_name = df_names.loc[(df_names['nameType']=='double') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')]
#if bln_print:
#print('double name', correct_name)
## only add name if it is not there yet
## if a person is referenced by its complete double name, e.g. Meier-Müller, he or she gets two entries
#if correct_name not in str_name.split(' '):
#str_name = add_to_string(str_name, correct_name)
#name_type = 'double'
## if it is a composite name
#elif term in list(df_names['shortName'].loc[df_names['nameType']=='comp']):
## get correct name
#correct_name = df_names.loc[(df_names['nameType']=='comp') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')]
#if bln_print:
#print('composite name', correct_name)
#str_name = add_to_string(str_name, correct_name)
#name_type = 'comp'
## if it contains a canton
## TODO: how to handle for people mentioned in text???
#elif term in list(df_names['shortName'].loc[df_names['nameType']=='canton']):
#if bln_print:
#print('contains a canton', term)
#
#str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
#name_type = 'canton'
#
#
## extract uniqueID
#list_temp = []
#if name_type in ['simple', 'double', 'comp']:
#list_temp = [df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
#elif name_type in ['canton']:
#list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
if
len
(
list_temp
)
>
0
:
if
bln_print
:
print
(
list_temp
,
list_uniqueID
)
print
(
type
(
list_temp
),
type
(
list_uniqueID
))
str_completeName
=
term
+
'
(CANTON MISSING)
'
print
(
list_temp
,
str_completeName
)
# set or update unique ID and name
# if no unique ID and name has been assigned so far
if
len
(
list_uniqueID
)
==
0
and
str_name
==
''
:
list_uniqueID
=
list_temp
str_name
=
add_to_string
(
str_name
,
str_completeName
)
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
,
name_ty
pe
)
# if there are already one or several peo
pe
else
:
print
(
'
is this even possible??
'
)
# if it is a double name,
if
name_type
==
'
double
'
:
if
list_uniqueID
==
list_temp
:
# do nothing if person has already been found
pass
else
:
# check whether we found a person with the same first part of the double lastname
# and overwrite if this is the case
# e.g. if we found a Meyer before we found a Meyer-Boller, e.g. 1971/20000010
if
str_completeName
.
split
(
'
-
'
)[
0
]
==
str_name
.
split
(
'
'
)[
0
]:
list_uniqueID
=
list_temp
str_name
=
add_to_string
(
''
,
str_completeName
)
# if we have a new person, we append
elif
len
(
set
(
list_temp
).
intersection
(
set
(
flatten
(
list_uniqueID
))))
!=
0
:
list_uniqueID
.
append
(
list_temp
)
str_name
=
add_to_string
(
str_name
,
str_completeName
)
return
str_name
,
list_uniqueID
,
name_type
def
update_list_uniqueID
(
list_uniqueID
,
list_temp
,
name_type
):
# if no unique ID has been assigned so far
if
len
(
list_uniqueID
)
==
0
:
list_uniqueID
=
list_temp
# if there are already one or several people and have a new person, we update
elif
len
(
list_uniqueID
)
>
0
and
len
(
set
(
list_temp
).
intersection
(
set
(
flatten
(
list_uniqueID
))))
==
0
:
list_uniqueID
.
append
(
list_temp
)
# if name_type is canton
# if name_type is canton, we override other entries by correct one
if
name_type
==
'
canton
'
and
len
(
list_temp
)
==
1
and
list_temp
[
0
]
in
list_uniqueID
:
list_uniqueID
=
list_temp
...
...
@@ -595,7 +556,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
# cannot happen for the first term in list_oi
elif
name_type
==
'
canton
'
:
list_cantonname
,
list_cantonabbr
,
list_citizenship
,
list_firstname
=
get_list_cantons
(
df_names
,
str_name
.
split
(
'
'
)[
0
])
list_cantonname
,
list_cantonabbr
,
list_citizenship
,
list_firstname
,
list_additionalInfo
=
get_list_cantons
(
df_names
,
str_name
.
split
(
'
'
)[
0
])
canton_type
=
''
if
term
in
list_cantonname
:
str_canton
=
term
...
...
@@ -613,14 +574,26 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
str_canton
=
term
canton_type
=
'
FirstName
'
print
(
'
!!! is a canton
'
,
term
,
list_oi
,
str_name
,
str_role
)
elif
term
in
list_additionalInfo
:
str_canton
=
term
canton_type
=
'
additionalInfo
'
print
(
'
!!! is a canton
'
,
term
,
list_oi
,
str_name
,
str_role
)
else
:
print
(
'
might be a canton
'
,
term
,
list_oi
,
str_name
,
str_role
)
# look for similar names based on (normalized) Damerau-Levenshtein distance
# TODO: might needs to be extended for other than cantonname
term_approx
=
get_approximate_term
(
term
,
np
.
array
(
list_cantonname
))
if
term_approx
:
str_canton
=
term_approx
canton_type
=
'
CantonName
'
print
(
'
might be a canton:
'
,
term
,
list_oi
,
str_name
,
str_role
,
term_approx
)
# if a canton or similar was found
if
canton_type
:
# get rid of CANTON MISSING
str_name
=
str_name
.
split
(
'
'
)[
0
]
# extract uniqueID
# if Citizenship, do proper comparison
if
canton_type
==
'
Citizenship
'
:
...
...
@@ -636,16 +609,19 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
list_temp
=
list
(
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
str_name
)
&
(
df_names
[
canton_type
]
==
str_citizenship
)].
iloc
[:,
df_names
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
str_completeName
=
df_names
[
'
completeName
'
].
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
str_name
)
&
(
df_names
[
canton_type
]
==
str_citizenship
)].
iloc
[
0
]
str_name
=
add_to_string
(
str_name
,
str_completeName
)
else
:
list_temp
=
list
(
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
str_name
)
&
(
df_names
[
canton_type
]
==
str_canton
)].
iloc
[:,
df_names
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
str_completeName
=
df_names
[
'
completeName
'
].
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
str_name
)
&
(
df_names
[
canton_type
]
==
str_canton
)].
iloc
[
0
]
str_name
=
add_to_string
(
str_name
,
str_completeName
)
print
(
list_temp
,
list_uniqueID
)
if
len
(
list_temp
)
>
0
:
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
,
name_type
)
if
str_completeName
.
split
(
'
'
)[
0
]
==
str_name
:
str_name
=
add_to_string
(
''
,
str_completeName
)
else
:
str_name
=
add_to_string
(
str_name
,
str_completeName
)
# if term is not easily mistaken as a name (avoid false positives)
elif
term
not
in
list_notnames
:
...
...
@@ -750,20 +726,27 @@ def get_list_cantons(df_names, str_name = ''):
else
:
df_temp
=
df_names
.
loc
[
df_names
[
'
nameType
'
]
==
'
canton
'
]
#print(df_temp)
# list of cantons
list_cantonname
=
list
(
df_temp
[
'
CantonName
'
])
# TODO this will lead to an error!
for
canton
in
[
'
Basel-Stadt
'
,
'
Basel-Landschaft
'
]:
if
canton
in
list_cantonname
:
list_cantonname
.
extend
([
'
Basel
'
])
if
'
Graubünden
'
in
list_cantonname
:
list_cantonname
.
extend
([
'
Bünden
'
])
if
'
Bern
'
in
list_cantonname
:
# check how this works!!
list_cantonname
.
extend
([
'
Berne
'
])
# list of canton abbreviations
list_cantonabbr
=
list
(
df_temp
[
'
CantonAbbreviation
'
])
# list of citizenships
list_citizenship
=
list
(
df_temp
[
'
Citizenship
'
])
list_citizenship
=
get_cities
(
list_citizenship
)
# list of first names
list_firstname
=
list
(
df_temp
[
'
FirstName
'
])
return
list_cantonname
,
list_cantonabbr
,
list_citizenship
,
list_firstname
# list of additional information
list_additionalInfo
=
list
(
df_temp
[
'
additionalInfo
'
])
return
list_cantonname
,
list_cantonabbr
,
list_citizenship
,
list_firstname
,
list_additionalInfo
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment