Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
democrasci_preprocWP1
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Marta Balode
democrasci_preprocWP1
Commits
8473096a
Commit
8473096a
authored
6 years ago
by
Lili Gasser
Browse files
Options
Downloads
Patches
Plain Diff
clean utils_annot
parent
135db555
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/python/utils_annot.py
+39
-57
39 additions, 57 deletions
src/python/utils_annot.py
with
39 additions
and
57 deletions
src/python/utils_annot.py
+
39
−
57
View file @
8473096a
...
@@ -224,6 +224,14 @@ def get_complete_text(textbox):
...
@@ -224,6 +224,14 @@ def get_complete_text(textbox):
# - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
# - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
def
label_speechstart
(
XML_new
,
ind_p
,
ind_t
,
text
,
ind_tl_colon
,
df_names
,
list_stopwords
,
list_notnames
,
bln_print
=
False
):
def
label_speechstart
(
XML_new
,
ind_p
,
ind_t
,
text
,
ind_tl_colon
,
df_names
,
list_stopwords
,
list_notnames
,
bln_print
=
False
):
# lists of roles
list_roles
=
[
'
Präsident
'
,
'
Präsidentin
'
,
'
Vizepräsident
'
,
'
Präsidium
'
,
'
Président
'
,
'
Présidente
'
,
'
président
'
,
'
présidente
'
,
'
Berichterstatter
'
,
'
Berichterstatterin
'
,
'
rapporteur
'
,
'
Sprecher
'
,
'
Sprecherin
'
,
'
porte-parole
'
,
'
porteparole
'
,
'
Bundesrat
'
,
'
Bundesrath
'
,
'
Bundesrätin
'
,
'
conseiller fédéral
'
,
'
Vizepräsident
'
]
list_roles_ext
=
[
'
Mehrheit
'
,
'
Minderheit
'
,
'
majorité
'
,
'
minorité
'
,
'
deutscher
'
,
'
deutsche
'
,
'
français
'
,
'
française
'
,
'
Kommission
'
,
'
commission
'
]
# initialize flag
# initialize flag
this_is_speech
=
False
this_is_speech
=
False
...
@@ -234,7 +242,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
...
@@ -234,7 +242,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
# at the beginning of a textbox and identifying a name or a role in front
# at the beginning of a textbox and identifying a name or a role in front
# of that colon
# of that colon
if
ind_tl_colon
>=
0
:
if
ind_tl_colon
>=
0
:
# if ':' in text[:100]:
# extract the index of the colon in the text
# extract the index of the colon in the text
colon_index_text
=
text
.
index
(
'
:
'
)
colon_index_text
=
text
.
index
(
'
:
'
)
...
@@ -259,12 +266,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
...
@@ -259,12 +266,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
# remove single characters
# remove single characters
# TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
# TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
# TODO: maybe exclude I and A to account for Appenzell
list_oi
=
[
term
for
term
in
list_oi
if
len
(
term
)
>
1
]
list_oi
=
[
term
for
term
in
list_oi
if
len
(
term
)
>
1
]
# # for every term
# if possible, find a name from the list
# for term in list_oi:
str_name
,
str_role
,
list_uniqueID
,
str_canton
=
find_names
(
list_oi
,
list_roles
,
list_roles_ext
,
df_names
,
list_notnames
,
bln_print
=
False
)
# if possible, find a name in a list
str_name
,
str_role
,
list_uniqueID
,
str_canton
=
find_names
(
list_oi
,
df_names
,
list_notnames
,
bln_print
=
False
)
if
bln_print
:
if
bln_print
:
print
(
'
name
'
,
str_name
,
'
role
'
,
str_role
)
print
(
'
name
'
,
str_name
,
'
role
'
,
str_role
)
...
@@ -300,14 +306,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
...
@@ -300,14 +306,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
thattext
=
XML_new
[
ind_p
][
ind_t
][
0
].
text
thattext
=
XML_new
[
ind_p
][
ind_t
][
0
].
text
colon_index
=
thattext
.
index
(
'
:
'
)
colon_index
=
thattext
.
index
(
'
:
'
)
# print(thattext)
try
:
try
:
# write speaker to first line
# write speaker to first line
XML_new
[
ind_p
][
ind_t
][
0
].
text
=
thattext
[:
colon_index
+
1
]
+
fontend
XML_new
[
ind_p
][
ind_t
][
0
].
text
=
thattext
[:
colon_index
+
1
]
+
fontend
# get start of speech with correct font start
# get start of speech with correct font start
# print(thattext[colon_index+1:])
if
thattext
[
colon_index
+
1
:].
startswith
(
'
[font
'
):
if
thattext
[
colon_index
+
1
:].
startswith
(
'
[font
'
):
startspeech
=
thattext
[
colon_index
+
1
:]
startspeech
=
thattext
[
colon_index
+
1
:]
elif
re
.
match
(
'
^[ ]?\[/font\]$
'
,
thattext
[
colon_index
+
1
:]):
elif
re
.
match
(
'
^[ ]?\[/font\]$
'
,
thattext
[
colon_index
+
1
:]):
...
@@ -317,8 +320,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
...
@@ -317,8 +320,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
else
:
else
:
startspeech
=
thattext
[
colon_index
+
1
:]
startspeech
=
thattext
[
colon_index
+
1
:]
# print(startspeech)
# write beginning of speech to second line
# write beginning of speech to second line
# (create new ET element if necessary)
# (create new ET element if necessary)
if
len
(
list
(
XML_new
[
ind_p
][
ind_t
]))
>
1
:
if
len
(
list
(
XML_new
[
ind_p
][
ind_t
]))
>
1
:
...
@@ -343,7 +344,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
...
@@ -343,7 +344,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
colon_index
=
thattext
.
index
(
'
:
'
)
colon_index
=
thattext
.
index
(
'
:
'
)
# get start of speech with correct font start
# get start of speech with correct font start
# print(thattext[colon_index+1:])
if
thattext
[
colon_index
+
1
:].
startswith
(
'
[font
'
):
if
thattext
[
colon_index
+
1
:].
startswith
(
'
[font
'
):
startspeech
=
thattext
[
colon_index
+
1
:]
startspeech
=
thattext
[
colon_index
+
1
:]
elif
re
.
match
(
'
^[ ]?\[/font\]$
'
,
thattext
[
colon_index
+
1
:]):
elif
re
.
match
(
'
^[ ]?\[/font\]$
'
,
thattext
[
colon_index
+
1
:]):
...
@@ -424,9 +424,9 @@ def flatten(l):
...
@@ -424,9 +424,9 @@ def flatten(l):
# - list_uniqueID: list with one or several uniqueIDs
# - list_uniqueID: list with one or several uniqueIDs
# - list_tupels: list of tuples containing all types of names
# - list_tupels: list of tuples containing all types of names
# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
def
find_names
(
list_oi
,
df_names
,
list_notnames
,
bln_print
=
False
):
def
find_names
(
list_oi
,
list_roles
,
list_roles_ext
,
df_names
,
list_notnames
,
bln_print
=
False
):
def
get_string
(
term
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
):
def
get_string
(
term
,
df_names
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
):
name_type
=
''
name_type
=
''
# if it is one of the simple names
# if it is one of the simple names
if
term
in
list
(
df_names
[
'
name_short
'
].
loc
[
df_names
[
'
type
'
]
==
'
simple
'
]):
if
term
in
list
(
df_names
[
'
name_short
'
].
loc
[
df_names
[
'
type
'
]
==
'
simple
'
]):
...
@@ -457,8 +457,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
...
@@ -457,8 +457,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
str_name
=
add_to_string
(
str_name
,
correct_name
)
str_name
=
add_to_string
(
str_name
,
correct_name
)
name_type
=
'
comp
'
name_type
=
'
comp
'
# if it contains a canton
# if it contains a canton
# !!! also pass list_oi to look for canton
# TODO: how to handle for people mentioned in text???
# !!! how to handle for people mentioned in text???
elif
term
in
list
(
df_names
[
'
name_short
'
].
loc
[
df_names
[
'
type
'
]
==
'
canton
'
]):
elif
term
in
list
(
df_names
[
'
name_short
'
].
loc
[
df_names
[
'
type
'
]
==
'
canton
'
]):
if
bln_print
:
if
bln_print
:
print
(
'
contains a canton
'
,
term
)
print
(
'
contains a canton
'
,
term
)
...
@@ -480,46 +479,35 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
...
@@ -480,46 +479,35 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
print
(
list_temp
,
list_uniqueID
)
print
(
list_temp
,
list_uniqueID
)
print
(
type
(
list_temp
),
type
(
list_uniqueID
))
print
(
type
(
list_temp
),
type
(
list_uniqueID
))
print
(
isinstance
(
list_uniqueID
,
list
))
print
(
isinstance
(
list_uniqueID
,
list
))
# if no unique ID has been assigned so far
if
len
(
list_uniqueID
)
==
0
:
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
)
list_uniqueID
=
list_temp
# if there are already one or several people and have a new person, we update
elif
len
(
list_uniqueID
)
>
0
and
len
(
set
(
list_temp
).
intersection
(
set
(
flatten
(
list_uniqueID
))))
==
0
:
list_uniqueID
.
append
(
list_temp
)
## if we already have several possible people, e.g. because of canton
#elif isinstance(int_uniqueID, tuple):
#print('I should be here')
## and refound the uniqueID of one of those, don't update
#if temp in int_uniqueID:
#pass
## and update if we don't have that uniqueID yet
#else:
#int_uniqueID = (int_uniqueID, temp)
## if a person with that uniqueID exists already, don't update
#elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID:
#print('but end up here.. not even.....')
#pass
## if a different unique ID has been assigned already
#else:
#int_uniqueID = (int_uniqueID, temp)
return
str_name
,
str_role
,
list_uniqueID
,
name_type
return
str_name
,
str_role
,
list_uniqueID
,
name_type
# function to merge newly found uniqueIDs into the uniqueIDs collected so far
# input:
# - list_uniqueID: uniqueIDs collected so far (may be empty; may contain
#   nested lists, one per additional group appended by earlier calls)
# - list_temp: uniqueIDs found for the current term
# output:
# - list_uniqueID: updated list of uniqueIDs
def update_list_uniqueID(list_uniqueID, list_temp):
    """Merge the IDs in ``list_temp`` into ``list_uniqueID`` and return the result.

    - If ``list_temp`` is empty, ``list_uniqueID`` is returned unchanged, so
      that no empty group is ever appended.
    - If ``list_uniqueID`` is empty, a copy of ``list_temp`` is returned, so
      that later in-place appends on the result cannot modify the caller's
      ``list_temp``.
    - Otherwise ``list_temp`` is appended (as one nested group, in place) only
      when none of its IDs is already present in the flattened
      ``list_uniqueID``.
    """
    # nothing new was found for this term: keep what we have
    if not list_temp:
        return list_uniqueID

    # if no unique ID has been assigned so far
    if not list_uniqueID:
        # return a copy to avoid aliasing the caller's list
        return list(list_temp)

    # if there are already one or several people and we have a new person, we update
    # (flatten is needed because earlier calls may have appended nested groups)
    if not set(list_temp).intersection(set(flatten(list_uniqueID))):
        list_uniqueID.append(list_temp)

    return list_uniqueID
# function to find correct term (in case of misspellings, etc.)
# function to find correct term (in case of misspellings, etc.)
def
get_approximate_term
(
term
,
array_all
_names
):
def
get_approximate_term
(
term
,
array_all
):
# TODO: probably need to improve this procedure
# TODO: probably need to improve this procedure
# - find better values ....
# - find better values ....
# initialize string
# initialize string
term_approx
=
''
term_approx
=
''
# get normalize array
# get normalize
d
array
array_normalized
=
array_all
_names
[
normalized_damerau_levenshtein_distance_ndarray
(
term
,
array_all
_names
)
<=
0.35
]
array_normalized
=
array_all
[
normalized_damerau_levenshtein_distance_ndarray
(
term
,
array_all
)
<=
0.35
]
array_normalized_values
=
normalized_damerau_levenshtein_distance_ndarray
(
term
,
array_normalized
)
array_normalized_values
=
normalized_damerau_levenshtein_distance_ndarray
(
term
,
array_normalized
)
# get absolute array
# get absolute array
array_absolute
=
array_all
_names
[
damerau_levenshtein_distance_ndarray
(
term
,
array_all
_names
)
<=
2
]
array_absolute
=
array_all
[
damerau_levenshtein_distance_ndarray
(
term
,
array_all
)
<=
2
]
array_absolute_values
=
damerau_levenshtein_distance_ndarray
(
term
,
array_absolute
)
array_absolute_values
=
damerau_levenshtein_distance_ndarray
(
term
,
array_absolute
)
if
bln_print
:
if
bln_print
:
print
(
term
)
print
(
term
)
...
@@ -560,14 +548,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
...
@@ -560,14 +548,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
str_canton
=
''
str_canton
=
''
name_type
=
''
name_type
=
''
# lists of roles
list_roles
=
[
'
Präsident
'
,
'
Präsidentin
'
,
'
Vizepräsident
'
,
'
Präsidium
'
,
'
Président
'
,
'
Présidente
'
,
'
président
'
,
'
présidente
'
,
'
Berichterstatter
'
,
'
Berichterstatterin
'
,
'
rapporteur
'
,
'
Sprecher
'
,
'
Sprecherin
'
,
'
porte-parole
'
,
'
porteparole
'
,
'
Bundesrat
'
,
'
Bundesrath
'
,
'
Bundesrätin
'
,
'
conseiller fédéral
'
,
'
Vizepräsident
'
]
list_roles_ext
=
[
'
Mehrheit
'
,
'
Minderheit
'
,
'
majorité
'
,
'
minorité
'
,
'
deutscher
'
,
'
deutsche
'
,
'
français
'
,
'
française
'
,
'
Kommission
'
,
'
commission
'
]
# extract list and array of last names
# extract list and array of last names
list_all_names
=
list
(
df_names
[
'
name_short
'
])
list_all_names
=
list
(
df_names
[
'
name_short
'
])
array_all_names
=
np
.
array
(
df_names
[
'
name_short
'
])
array_all_names
=
np
.
array
(
df_names
[
'
name_short
'
])
...
@@ -580,18 +560,19 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
...
@@ -580,18 +560,19 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
if
term
in
list_roles
:
if
term
in
list_roles
:
# get correct name and uniqueID, or role, for that term
# get correct name and uniqueID, or role, for that term
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term
,
df_names
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
if
bln_print
:
if
bln_print
:
print
(
'
found a role
'
,
term
)
print
(
'
found a role
'
,
term
)
# TODO: also look for similar terms (misspellings)
# TODO: also look for similar terms (misspellings)
# TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter
elif
term
in
list_roles_ext
:
elif
term
in
list_roles_ext
:
pass
pass
# TODO: extract whether it is minority or majority and save that information
# TODO: extract whether it is minority or majority and save that information
# can
not happen for the first term
# cannot happen for the first term
elif
name_type
==
'
canton
'
:
elif
name_type
==
'
canton
'
:
list_cantonname
,
list_cantonabbr
,
list_citizenship
,
list_firstname
=
get_list_cantons
(
df_names
,
str_name
.
split
(
'
'
)[
0
])
list_cantonname
,
list_cantonabbr
,
list_citizenship
,
list_firstname
=
get_list_cantons
(
df_names
,
str_name
.
split
(
'
'
)[
0
])
canton_type
=
''
canton_type
=
''
...
@@ -614,7 +595,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
...
@@ -614,7 +595,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
else
:
else
:
print
(
'
might be a canton
'
,
term
,
list_oi
,
str_name
,
str_role
)
print
(
'
might be a canton
'
,
term
,
list_oi
,
str_name
,
str_role
)
# TODO: maybe: go to next elif?
# if a canton or similar was found
# if a canton or similar was found
if
canton_type
:
if
canton_type
:
...
@@ -630,7 +610,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
...
@@ -630,7 +610,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
try
:
try
:
if
len
(
list_cities
)
==
1
:
if
len
(
list_cities
)
==
1
:
str_citizenship
=
list_cities
[
0
]
str_citizenship
=
list_cities
[
0
]
#
except:
except
:
print
(
'
found no or more than one person with citizenship
'
,
str_canton
,
str_name
)
print
(
'
found no or more than one person with citizenship
'
,
str_canton
,
str_name
)
pass
pass
...
@@ -639,7 +619,9 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
...
@@ -639,7 +619,9 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
else
:
else
:
list_temp
=
list
(
df_names
.
loc
[(
df_names
[
'
type
'
]
==
name_type
)
&
(
df_names
[
'
name_short
'
]
==
str_name
)
&
(
df_names
[
canton_type
]
==
str_canton
)].
iloc
[:,
df_names
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
list_temp
=
list
(
df_names
.
loc
[(
df_names
[
'
type
'
]
==
name_type
)
&
(
df_names
[
'
name_short
'
]
==
str_name
)
&
(
df_names
[
canton_type
]
==
str_canton
)].
iloc
[:,
df_names
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
print
(
list_temp
,
list_uniqueID
)
print
(
list_temp
,
list_uniqueID
)
list_uniqueID
=
list_temp
if
len
(
list_temp
)
>
0
:
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
)
# if term is not easily mistaken as a name (avoid false positives)
# if term is not easily mistaken as a name (avoid false positives)
elif
term
not
in
list_notnames
:
elif
term
not
in
list_notnames
:
...
@@ -647,7 +629,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
...
@@ -647,7 +629,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
# if term is in the list of all names
# if term is in the list of all names
if
term
in
list_all_names
:
if
term
in
list_all_names
:
# get correct name and uniqueID, or role, for that term
# get correct name and uniqueID, or role, for that term
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term
,
df_names
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
if
bln_print
:
if
bln_print
:
print
(
'
=== correct name
'
,
term
)
print
(
'
=== correct name
'
,
term
)
...
@@ -659,7 +641,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
...
@@ -659,7 +641,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
# if one was found, get correct name, etc.
# if one was found, get correct name, etc.
if
term_approx
:
if
term_approx
:
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term_approx
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
str_name
,
str_role
,
list_uniqueID
,
name_type
=
get_string
(
term_approx
,
df_names
,
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
if
bln_print
:
if
bln_print
:
print
(
'
=== approximate name
'
,
str_name
,
term_approx
)
print
(
'
=== approximate name
'
,
str_name
,
term_approx
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment