Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
democrasci_preprocWP1
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Marta Balode
democrasci_preprocWP1
Commits
8473096a
Commit
8473096a
authored
6 years ago
by
Lili Gasser
Browse files
Options
Downloads
Patches
Plain Diff
clean utils_annot
parent
135db555
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/python/utils_annot.py
+39
-57
39 additions, 57 deletions
src/python/utils_annot.py
with
39 additions
and
57 deletions
src/python/utils_annot.py
+
39
−
57
View file @
8473096a
...
...
@@ -224,6 +224,14 @@ def get_complete_text(textbox):
# - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
def
label_speechstart
(
XML_new
,
ind_p
,
ind_t
,
text
,
ind_tl_colon
,
df_names
,
list_stopwords
,
list_notnames
,
bln_print
=
False
):
# lists of roles
list_roles
=
[
'
Präsident
'
,
'
Präsidentin
'
,
'
Vizepräsident
'
,
'
Präsidium
'
,
'
Président
'
,
'
Présidente
'
,
'
président
'
,
'
présidente
'
,
'
Berichterstatter
'
,
'
Berichterstatterin
'
,
'
rapporteur
'
,
'
Sprecher
'
,
'
Sprecherin
'
,
'
porte-parole
'
,
'
porteparole
'
,
'
Bundesrat
'
,
'
Bundesrath
'
,
'
Bundesrätin
'
,
'
conseiller fédéral
'
,
'
Vizepräsident
'
]
list_roles_ext
=
[
'
Mehrheit
'
,
'
Minderheit
'
,
'
majorité
'
,
'
minorité
'
,
'
deutscher
'
,
'
deutsche
'
,
'
français
'
,
'
française
'
,
'
Kommission
'
,
'
commission
'
]
# initialize flag
this_is_speech
=
False
...
...
@@ -234,7 +242,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
# at the beginning of a textbox and identifiying a name or a role in front
# of that colon
if
ind_tl_colon
>=
0
:
# if ':' in text[:100]:
# extract the index of the colon in the text
colon_index_text
=
text
.
index
(
'
:
'
)
...
...
@@ -259,12 +266,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
# remove single characters
# TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
# TODO: maybe exclude I and A to account for Appenzell
list_oi
=
[
term
for
term
in
list_oi
if
len
(
term
)
>
1
]
# # for every term
# for term in list_oi:
# if possible, find a name in a list
str_name
,
str_role
,
list_uniqueID
,
str_canton
=
find_names
(
list_oi
,
df_names
,
list_notnames
,
bln_print
=
False
)
# if possible, find a name from the list
str_name
,
str_role
,
list_uniqueID
,
str_canton
=
find_names
(
list_oi
,
list_roles
,
list_roles_ext
,
df_names
,
list_notnames
,
bln_print
=
False
)
if
bln_print
:
print
(
'
name
'
,
str_name
,
'
role
'
,
str_role
)
...
...
@@ -300,14 +306,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
thattext
=
XML_new
[
ind_p
][
ind_t
][
0
].
text
colon_index
=
thattext
.
index
(
'
:
'
)
# print(thattext)
try
:
# write speaker to first line
XML_new
[
ind_p
][
ind_t
][
0
].
text
=
thattext
[:
colon_index
+
1
]
+
fontend
# get start of speech with correct font start
# print(thattext[colon_index+1:])
if
thattext
[
colon_index
+
1
:].
startswith
(
'
[font
'
):
startspeech
=
thattext
[
colon_index
+
1
:]
elif
re
.
match
(
'
^[ ]?\[/font\]$
'
,
thattext
[
colon_index
+
1
:]):
...
...
@@ -317,8 +320,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
else
:
startspeech
=
thattext
[
colon_index
+
1
:]
# print(startspeech)
# write beginning of speech to second line
# (create new ET element if necessary)
if
len
(
list
(
XML_new
[
ind_p
][
ind_t
]))
>
1
:
...
...
@@ -343,7 +344,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
colon_index
=
thattext
.
index
(
'
:
'
)
# get start of speech with correct font start
# print(thattext[colon_index+1:])
if
thattext
[
colon_index
+
1
:].
startswith
(
'
[font
'
):
startspeech
=
thattext
[
colon_index
+
1
:]
elif
re
.
match
(
'
^[ ]?\[/font\]$
'
,
thattext
[
colon_index
+
1
:]):
...
...
@@ -424,9 +424,9 @@ def flatten(l):
# - list_uniqueID: list with one or several uniqueIDs
# - list_tupels: list of tuples containing all types of names
# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
def
find_names
(
list_oi
,
df_names
,
list_notnames
,
bln_print
=
False
):
def
find_names
(
list_oi
,
list_roles
,
list_roles_ext
,
df_names
,
list_notnames
,
bln_print
=
False
):
def
get_string
(
term
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
):
def
get_string
(
term
,
df_names
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
):
name_type
=
''
# if it is one of the simple names
if
term
in
list
(
df_names
[
'
name_short
'
].
loc
[
df_names
[
'
type
'
]
==
'
simple
'
]):
...
...
@@ -457,8 +457,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
str_name
=
add_to_string
(
str_name
,
correct_name
)
name_type
=
'
comp
'
# if it contains a canton
# !!! also pass list_oi to look for canton
# !!! how to handle for people mentioned in text???
# TODO: how to handle for people mentioned in text???
elif
term
in
list
(
df_names
[
'
name_short
'
].
loc
[
df_names
[
'
type
'
]
==
'
canton
'
]):
if
bln_print
:
print
(
'
contains a canton
'
,
term
)
...
...
@@ -480,46 +479,35 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
print
(
list_temp
,
list_uniqueID
)
print
(
type
(
list_temp
),
type
(
list_uniqueID
))
print
(
isinstance
(
list_uniqueID
,
list
))
# if no unique ID has been assigned so far
if
len
(
list_uniqueID
)
==
0
:
list_uniqueID
=
list_temp
# if there are already one or several people and have a new person, we update
elif
len
(
list_uniqueID
)
>
0
and
len
(
set
(
list_temp
).
intersection
(
set
(
flatten
(
list_uniqueID
))))
==
0
:
list_uniqueID
.
append
(
list_temp
)
## if we already have several possible people, e.g. because of canton
#elif isinstance(int_uniqueID, tuple):
#print('I should be here')
## and refound the uniqueID of one of those, don't update
#if temp in int_uniqueID:
#pass
## and update if we don't have that uniqueID yet
#else:
#int_uniqueID = (int_uniqueID, temp)
## if a person with that uniqueID exists already, don't update
#elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID:
#print('but end up here.. not even.....')
#pass
## if a different unique ID has been assigned already
#else:
#int_uniqueID = (int_uniqueID, temp)
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
)
return
str_name
,
str_role
,
list_uniqueID
,
name_type
def update_list_uniqueID(list_uniqueID, list_temp):
    """Merge newly found unique IDs into the IDs collected so far.

    - list_uniqueID: IDs gathered so far; may contain nested lists, since
      it is passed through flatten() before comparison
    - list_temp: IDs found for the current term

    Returns list_temp itself when nothing has been collected yet.
    Otherwise returns list_uniqueID, to which list_temp has been appended
    (in place, as one nested element) only if none of its IDs is already
    present.
    """
    # no unique ID has been assigned so far: adopt the new candidates
    if len(list_uniqueID) == 0:
        return list_temp

    # IDs shared between the new candidates and everything collected so far
    ids_in_common = set(list_temp) & set(flatten(list_uniqueID))

    # only a genuinely new person (no shared ID) extends the collection
    if len(ids_in_common) == 0:
        list_uniqueID.append(list_temp)

    return list_uniqueID
# function to find correct term (in case of misspellings, etc.)
def
get_approximate_term
(
term
,
array_all
_names
):
def
get_approximate_term
(
term
,
array_all
):
# TODO: probably need to improve this procedure
# - find better values ....
# initialize string
term_approx
=
''
# get normalize array
array_normalized
=
array_all
_names
[
normalized_damerau_levenshtein_distance_ndarray
(
term
,
array_all
_names
)
<=
0.35
]
# get normalize
d
array
array_normalized
=
array_all
[
normalized_damerau_levenshtein_distance_ndarray
(
term
,
array_all
)
<=
0.35
]
array_normalized_values
=
normalized_damerau_levenshtein_distance_ndarray
(
term
,
array_normalized
)
# get absolute array
array_absolute
=
array_all
_names
[
damerau_levenshtein_distance_ndarray
(
term
,
array_all
_names
)
<=
2
]
array_absolute
=
array_all
[
damerau_levenshtein_distance_ndarray
(
term
,
array_all
)
<=
2
]
array_absolute_values
=
damerau_levenshtein_distance_ndarray
(
term
,
array_absolute
)
if
bln_print
:
print
(
term
)
...
...
@@ -560,14 +548,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
str_canton
=
''
name_type
=
''
# lists of roles
list_roles
=
[
'
Präsident
'
,
'
Präsidentin
'
,
'
Vizepräsident
'
,
'
Präsidium
'
,
'
Président
'
,
'
Présidente
'
,
'
président
'
,
'
présidente
'
,
'
Berichterstatter
'
,
'
Berichterstatterin
'
,
'
rapporteur
'
,
'
Sprecher
'
,
'
Sprecherin
'
,
'
porte-parole
'
,
'
porteparole
'
,
'
Bundesrat
'
,
'
Bundesrath
'
,
'
Bundesrätin
'
,
'
conseiller fédéral
'
,
'
Vizepräsident
'
]
list_roles_ext
=
[
'
Mehrheit
'
,
'
Minderheit
'
,
'
majorité
'
,
'
minorité
'
,
'
deutscher
'
,
'
deutsche
'
,
'
français
'
,
'
française
'
,
'
Kommission
'
,
'
commission
'
]
# extract list and array of last names
list_all_names
=
list
(
df_names
[
'
name_short
'
])
array_all_names
=
np
.
array
(
df_names
[
'
name_short
'
])
...
...
@@ -580,18 +560,19 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
if
term
in
list_roles
:
# get correct name and uniqueID, or role, for that term
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term
,
df_names
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
if
bln_print
:
print
(
'
found a role
'
,
term
)
# TODO: also look for similar terms (misspellings)
# TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter
elif
term
in
list_roles_ext
:
pass
# TODO: extract whether it is minority or majority and save that information
# can
not happen for the first term
# cannot happen for the first term
elif
name_type
==
'
canton
'
:
list_cantonname
,
list_cantonabbr
,
list_citizenship
,
list_firstname
=
get_list_cantons
(
df_names
,
str_name
.
split
(
'
'
)[
0
])
canton_type
=
''
...
...
@@ -614,7 +595,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
else
:
print
(
'
might be a canton
'
,
term
,
list_oi
,
str_name
,
str_role
)
# TODO: maybe: go to next elif?
# if a canton or similar was found
if
canton_type
:
...
...
@@ -630,7 +610,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
try
:
if
len
(
list_cities
)
==
1
:
str_citizenship
=
list_cities
[
0
]
#
except:
except
:
print
(
'
found no or more than one person with citizenship
'
,
str_canton
,
str_name
)
pass
...
...
@@ -639,7 +619,9 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
else
:
list_temp
=
list
(
df_names
.
loc
[(
df_names
[
'
type
'
]
==
name_type
)
&
(
df_names
[
'
name_short
'
]
==
str_name
)
&
(
df_names
[
canton_type
]
==
str_canton
)].
iloc
[:,
df_names
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
print
(
list_temp
,
list_uniqueID
)
list_uniqueID
=
list_temp
if
len
(
list_temp
)
>
0
:
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
)
# if term is not easily mistaken as a name (avoid false positives)
elif
term
not
in
list_notnames
:
...
...
@@ -647,7 +629,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
# if term is in the list of all names
if
term
in
list_all_names
:
# get correct name and uniqueID, or role, for that term
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term
,
df_names
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
if
bln_print
:
print
(
'
=== correct name
'
,
term
)
...
...
@@ -659,7 +641,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
# if one was found, get correct name, etc.
if
term_approx
:
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term_approx
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term_approx
,
df_names
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
if
bln_print
:
print
(
'
=== approximate name
'
,
str_name
,
term_approx
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment