Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
democrasci_preprocWP1
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Marta Balode
democrasci_preprocWP1
Commits
14eb9b77
Commit
14eb9b77
authored
6 years ago
by
Lili Gasser
Browse files
Options
Downloads
Patches
Plain Diff
implemented council and date disambiguation
parent
321563de
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
data/lists/wrongly_identified_speakers.txt
+1
-1
1 addition, 1 deletion
data/lists/wrongly_identified_speakers.txt
src/python/run_extract_discussions.py
+2
-3
2 additions, 3 deletions
src/python/run_extract_discussions.py
src/python/utils_annot.py
+56
-47
56 additions, 47 deletions
src/python/utils_annot.py
with
59 additions
and
51 deletions
data/lists/wrongly_identified_speakers.txt
+
1
−
1
View file @
14eb9b77
...
...
@@ -7,7 +7,7 @@ also check for council:
one MP not active in whole year, leads to other not uniquely identified
-----------------------------------------------------------------------
1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!)
1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!)
--> solved!
1925/20029967: Seiler (in December, the second Seiler already left) --> finds two!) --> solved!
1925/20029967: Huber (in December, the second Huber already left) --> finds two!) --> solved because only NR!
1925/20029882: Naine (in June, there is only one Naine, in December, another one joins --> finds two!) also in ...96, ...97, etc. --> solved!
...
...
This diff is collapsed.
Click to expand it.
src/python/run_extract_discussions.py
+
2
−
3
View file @
14eb9b77
...
...
@@ -141,7 +141,6 @@ file_doc.get_council_date()
#len(files_to_process)
file_doc
.
check_discussion
()
str_date
=
'
22.09.1925
'
str_date
=
'
1925-12-09 08:00
'
import
datetime
datetime
.
datetime
.
strptime
(
str_date
,
'
%
d.%m.%Y
'
)
datetime
.
datetime
.
strptime
(
str_date
,
'
%
Y-%m-%d %H:%M
'
)
This diff is collapsed.
Click to expand it.
src/python/utils_annot.py
+
56
−
47
View file @
14eb9b77
...
...
@@ -56,11 +56,11 @@ def get_council_and_date(path_meta_xml_file):
# parse, get root and then part of interest
XML_tree
=
ET
.
parse
(
path_meta_xml_file
)
XML_root
=
XML_tree
.
getroot
()
XML_poi
=
XML_root
[
0
]
.
find
(
'
META_FROM_DB
'
)
XML_poi
=
XML_root
[
0
]
# get council and date
str_council
=
XML_poi
.
attrib
[
'
RAT
'
]
str_date
=
XML_poi
.
attrib
[
'
DATUM
'
]
str_council
=
XML_poi
.
find
(
'
META_FROM_DB
'
).
attrib
[
'
RAT
'
]
str_date
=
XML_poi
.
attrib
[
'
PUBLIKATIONS_
DATUM
'
]
return
(
str_council
,
str_date
)
...
...
@@ -570,7 +570,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
# cannot happen for the first term in list_oi
elif
name_type
==
'
canton
'
:
list_cantonname
,
list_cantonabbr
,
list_citizenship
,
list_firstname
,
list_additionalInfo
=
get_list_cantons
(
df_names
,
str_name
.
split
(
'
'
)[
0
])
list_cantonname
,
list_cantonabbr
,
list_citizenship
,
list_firstname
,
list_additionalInfo
=
get_list_cantons
(
df_names
,
str_name
.
split
(
'
'
)[
0
]
,
str_council
)
canton_type
=
''
if
term
in
list_cantonname
:
str_canton
=
term
...
...
@@ -608,10 +608,11 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
# get rid of CANTON MISSING
str_name
=
str_name
.
split
(
'
'
)[
0
]
df_temp
=
get_df_temp_canton
(
df_names
,
str_name
,
str_council
)
# extract uniqueID
# if Citizenship,
do proper comparison
# if Citizenship,
get list of cities and compare each to term
if
canton_type
==
'
Citizenship
'
:
df_temp
=
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
str_name
)]
# get list of cities
list_cities
=
[
entry
for
entry
in
df_temp
[
canton_type
]
if
str_canton
in
get_cities
([
entry
])]
str_citizenship
=
''
try
:
...
...
@@ -621,12 +622,12 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
print
(
'
found no or more than one person with citizenship
'
,
str_canton
,
str_name
)
pass
list_temp
=
list
(
df_
names
.
loc
[(
df_
names
[
'
nameType
'
]
==
name_type
)
&
(
df_
names
[
'
shortName
'
]
==
str_name
)
&
(
df_
names
[
canton_type
]
==
str_citizenship
)].
iloc
[:,
df_
names
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
str_completeName
=
df_
names
[
'
completeName
'
].
loc
[(
df_
names
[
'
nameType
'
]
==
name_type
)
&
(
df_
names
[
'
shortName
'
]
==
str_name
)
&
(
df_
names
[
canton_type
]
==
str_citizenship
)].
iloc
[
0
]
list_temp
=
list
(
df_
temp
.
loc
[(
df_
temp
[
'
nameType
'
]
==
name_type
)
&
(
df_
temp
[
'
shortName
'
]
==
str_name
)
&
(
df_
temp
[
canton_type
]
==
str_citizenship
)].
iloc
[:,
df_
temp
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
str_completeName
=
df_
temp
[
'
completeName
'
].
loc
[(
df_
temp
[
'
nameType
'
]
==
name_type
)
&
(
df_
temp
[
'
shortName
'
]
==
str_name
)
&
(
df_
temp
[
canton_type
]
==
str_citizenship
)].
iloc
[
0
]
else
:
list_temp
=
list
(
df_
names
.
loc
[(
df_
names
[
'
nameType
'
]
==
name_type
)
&
(
df_
names
[
'
shortName
'
]
==
str_name
)
&
(
df_
names
[
canton_type
]
==
str_canton
)].
iloc
[:,
df_
names
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
str_completeName
=
df_
names
[
'
completeName
'
].
loc
[(
df_
names
[
'
nameType
'
]
==
name_type
)
&
(
df_
names
[
'
shortName
'
]
==
str_name
)
&
(
df_
names
[
canton_type
]
==
str_canton
)].
iloc
[
0
]
list_temp
=
list
(
df_
temp
.
loc
[(
df_
temp
[
'
nameType
'
]
==
name_type
)
&
(
df_
temp
[
'
shortName
'
]
==
str_name
)
&
(
df_
temp
[
canton_type
]
==
str_canton
)].
iloc
[:,
df_
temp
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
str_completeName
=
df_
temp
[
'
completeName
'
].
loc
[(
df_
temp
[
'
nameType
'
]
==
name_type
)
&
(
df_
temp
[
'
shortName
'
]
==
str_name
)
&
(
df_
temp
[
canton_type
]
==
str_canton
)].
iloc
[
0
]
print
(
list_temp
,
list_uniqueID
)
...
...
@@ -662,10 +663,12 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
# initial checks for not uniquely identified peoples
# TODO check for false positives of these procedures
if
name_type
==
'
canton
'
:
df_temp
=
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
str_name
.
split
(
'
'
)[
0
])]
# check if person can be identified from council
list_councils
=
list
(
df_temp
[
'
CouncilName
'
])
if
list_councils
.
count
(
str_council
)
==
1
:
df_temp
=
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
str_name
.
split
(
'
'
)[
0
])
&
(
df_names
[
'
CouncilName
'
]
==
str_council
)]
if
df_temp
.
shape
[
0
]
==
1
:
## check if person can be identified from council
#list_councils = list(df_temp['CouncilName'])
#if list_councils.count(str_council) == 1:
list_temp
=
list
(
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
str_name
.
split
(
'
'
)[
0
])
&
(
df_names
[
'
CouncilName
'
]
==
str_council
)].
iloc
[:,
df_names
.
columns
.
get_loc
(
'
uniqueIndex
'
)])
str_completeName
=
df_names
[
'
completeName
'
].
loc
[(
df_names
[
'
nameType
'
]
==
name_type
)
&
(
df_names
[
'
shortName
'
]
==
str_name
.
split
(
'
'
)[
0
])
&
(
df_names
[
'
CouncilName
'
]
==
str_council
)].
iloc
[
0
]
...
...
@@ -675,36 +678,35 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
else
:
str_name
=
add_to_string
(
str_name
,
str_completeName
)
# check if person can be identified from date of discussion
# TODO: is input dataformat always the same?
df_temp_before
=
df_temp
[
pd
.
to_datetime
(
df_temp
[
'
DateJoining
'
])
<=
datetime
.
datetime
.
strptime
(
str_date
,
'
%d.%m.%Y
'
)]
# TODO: replace by (or add another condition) if df_temp_before.shape[0]
< d
f_temp.shape[0
]
if
df_temp_before
.
shape
[
0
]
==
1
:
list_temp
=
list
(
df_temp_before
[
'
uniqueIndex
'
])
str_completeName
=
df_temp_before
[
'
completeName
'
].
iloc
[
0
]
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
,
name_type
)
if
str_completeName
.
split
(
'
'
)[
0
]
==
str_name
.
split
(
'
'
)[
0
]:
str_name
=
add_to_string
(
''
,
str_completeName
)
else
:
str_name
=
add_to_string
(
str_name
,
str_completeName
)
df_temp_after
=
df_temp
[
pd
.
to_datetime
(
df_temp
[
'
DateLeaving
'
])
>=
datetime
.
datetime
.
strptime
(
str_date
,
'
%d.%m.%Y
'
)]
# TODO: replace by (or add another condition) if df_temp_before.shape[0] < df_temp.shape[0
]
if
df_temp_after
.
shape
[
0
]
==
1
:
list_temp
=
list
(
df_temp_after
[
'
uniqueIndex
'
])
str_completeName
=
df_temp_after
[
'
completeName
'
].
iloc
[
0
]
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
,
name_type
)
if
str_completeName
.
split
(
'
'
)[
0
]
==
str_name
.
split
(
'
'
)[
0
]:
str_name
=
add_to_string
(
''
,
str_completeName
)
else
:
str_name
=
add_to_string
(
str_name
,
str_completeName
)
print
(
str_date
,
df_temp_before
.
shape
,
df_temp_after
.
shape
)
else
:
# check if person can be identified from date of discussion
# exclude people that joined after date of discussion
df_temp_before
=
df_temp
[
pd
.
to_datetime
(
df_temp
[
'
DateJoining
'
])
<
=
d
atetime
.
datetime
.
strptime
(
str_date
,
'
%Y-%m-%d %H:%M
'
)
]
if
df_temp_before
.
shape
[
0
]
==
1
:
list_temp
=
list
(
df_temp_before
[
'
uniqueIndex
'
])
str_completeName
=
df_temp_before
[
'
completeName
'
].
iloc
[
0
]
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
,
name_type
)
if
str_completeName
.
split
(
'
'
)[
0
]
==
str_name
.
split
(
'
'
)[
0
]:
str_name
=
add_to_string
(
''
,
str_completeName
)
else
:
str_name
=
add_to_string
(
str_name
,
str_completeName
)
# exclude people that left before date of discussion
df_temp_after
=
df_temp
[
pd
.
to_datetime
(
df_temp
[
'
DateLeaving
'
])
>=
datetime
.
datetime
.
strptime
(
str_date
,
'
%Y-%m-%d %H:%M
'
)
]
if
df_temp_after
.
shape
[
0
]
==
1
:
list_temp
=
list
(
df_temp_after
[
'
uniqueIndex
'
])
str_completeName
=
df_temp_after
[
'
completeName
'
].
iloc
[
0
]
list_uniqueID
=
update_list_uniqueID
(
list_uniqueID
,
list_temp
,
name_type
)
if
str_completeName
.
split
(
'
'
)[
0
]
==
str_name
.
split
(
'
'
)[
0
]:
str_name
=
add_to_string
(
''
,
str_completeName
)
else
:
str_name
=
add_to_string
(
str_name
,
str_completeName
)
print
(
str_date
,
df_temp
.
shape
,
df_temp_before
.
shape
,
df_temp_after
.
shape
)
# TODO: does this order make sense? council before date??
# TODO: function to update list unique ID and str_name
...
...
@@ -783,13 +785,20 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
def
get_cities
(
list_citizenship
):
return
[
city
[:
-
5
]
for
item
in
list_citizenship
for
city
in
item
.
split
(
'
,
'
)]
# function to get list of places
def
get_list_cantons
(
df_names
,
str_name
=
''
):
if
str_
name
:
df_temp
=
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
'
canton
'
)
&
(
df_names
[
'
shortName
'
]
==
str_name
)]
def
get_df_temp_canton
(
df_names
,
str_name
,
str_council
):
if
str_
council
in
[
'
Nationalrat
'
,
'
Ständerat
'
]
:
df_temp
=
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
'
canton
'
)
&
(
df_names
[
'
shortName
'
]
==
str_name
)
&
(
df_names
[
'
CouncilName
'
]
==
str_council
)
]
else
:
df_temp
=
df_names
.
loc
[
df_names
[
'
nameType
'
]
==
'
canton
'
]
#print(df_temp)
df_temp
=
df_names
.
loc
[(
df_names
[
'
nameType
'
]
==
'
canton
'
)
&
(
df_names
[
'
shortName
'
]
==
str_name
)]
return
df_temp
# function to get list of places
def
get_list_cantons
(
df_names
,
str_name
,
str_council
=
''
):
df_temp
=
get_df_temp_canton
(
df_names
,
str_name
,
str_council
)
# list of cantons
list_cantonname
=
list
(
df_temp
[
'
CantonName
'
])
# TODO this will lead to an error!
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment