Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
democrasci_preprocWP1
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Marta Balode
democrasci_preprocWP1
Commits
be210bb1
Commit
be210bb1
authored
6 years ago
by
Lili Gasser
Browse files
Options
Downloads
Patches
Plain Diff
cleaned, updated description, etc
parent
f36c26c4
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/python/extractMPs.py
+64
-63
64 additions, 63 deletions
src/python/extractMPs.py
with
64 additions
and
63 deletions
src/python/extractMPs.py
+
64
−
63
View file @
be210bb1
...
...
@@ -14,33 +14,36 @@ output_folder_dict = sys.argv[4] #'./data/politicians/lastnames/'
class
MPs_Extractor
(
object
):
def
__init__
(
self
,
years
,
input_file
,
output_file_csv
,
output_folder_dict
,
df_addInfo
):
def
__init__
(
self
,
years
,
input_file
,
input_file_addInfo
,
output_file_csv
,
output_folder_dict
):
self
.
input_file
=
input_file
self
.
input_file_addInfo
=
input_file_addInfo
self
.
output_file_csv
=
output_file_csv
self
.
output_folder_dict
=
output_folder_dict
self
.
range_years
=
range
(
years
[
0
],
years
[
1
]
+
1
)
self
.
df_addInfo
=
df_addInfo
# function to
get lists of lastnames
# function to
refine dataframe for name disambiguation
# input:
# - df_year: dataframe for a year
# output:
# TODO: update
# - list_names:
# contains:
# - list of last names that appear only once and cannot be split
# - list of last names that are made up of two names such as 'Meier-Müller'
# for each double name, four entries are made:
# - ('Meier', 'Meier-Müller')
# - ('Müller', 'Meier-Müller')
# - ('Meier-Müller', 'Meier-Müller')
# - ('MeierMüller', 'Meier-Müller')
# - list for composite last names such as 'von Arx' or 'de Stoppani'
# will be saved as ('Arx', 'von Arx')
# - list for people with the same last names
# will be saved as (lastname, lastname (canton)) for each person
# if the name is a composite name: ('Arx', 'von Arx (canton)')
def
get_list_of_lastnames
(
self
,
df_year
,
df_after1890
):
# - df_year: updated dataframe with shortName, completeName and nameType
# - complete name: lastname firstname (canton cantonabbr)
# - if lastname appears multiple times
# nameType = 'canton'
# shortname = according to following three types
# - if lastname is a composite name such as 'von Arx' or 'de Stoppani'
# nameType = 'comp'
# shortName = Arx or Stoppani
# - if lastname is a double name such as 'Meier-Müller'
# nameType = 'double'
# for each double name, four entries are made:
# - shortName = 'Meier'
# - shortName = 'Müller'
# - shortName = 'Meier-Müller'
# - shortName = 'MeierMüller'
# - if lastname is none of the above
# nameType = 'simple'
# shortName = lastname
def
refine_yearly_dataframe
(
self
,
df_year
):
str_simple
=
'
simple
'
str_double
=
'
double
'
str_comp
=
'
comp
'
...
...
@@ -48,31 +51,23 @@ class MPs_Extractor(object):
# function to split lastname and save meaningful part(s) to list
def
split_lastname
(
df_year
,
lastname
,
uniqueID
,
str_completeName
,
bln_unique
=
True
):
# if last name is a composite name, e.g. 'von Arx' and 'de Stoppani'
lastname_split
=
lastname
.
split
()
if
len
(
lastname_split
)
>
1
:
for
item
in
lastname_split
:
if
item
not
in
[
'
von
'
,
'
de
'
,
'
Ab
'
,
'
van
'
]:
# write distinctive item to extended list
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
shortName
'
]
=
item
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
completeName
'
]
=
str_completeName
if
bln_unique
:
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
nameType
'
]
=
str_comp
else
:
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
nameType
'
]
=
str_canton
else
:
# if last name is a double name, e.g. 'Meier-Müller'
lastname_split2
=
lastname
.
replace
(
'
-
'
,
'
'
).
split
()
if
len
(
lastname_split2
)
>
1
:
# set lastname and completename
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
shortName
'
]
=
lastname
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
completeName
'
]
=
str_completeName
# set nametype
if
bln_unique
:
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
nameType
'
]
=
str_double
else
:
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
nameType
'
]
=
str_canton
def
update_dataframe
(
df_year
,
str_shortName
,
str_completeName
,
str_nameType
,
bln_unique
):
# set short name
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
shortName
'
]
=
str_shortName
# set complete name
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
completeName
'
]
=
str_completeName
# set name type
if
bln_unique
:
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
nameType
'
]
=
str_nameType
else
:
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
nameType
'
]
=
str_canton
# generate three more entries for double names
if
str_nameType
==
'
double
'
:
# duplicate this entry three times
df_tripled
=
df_year
[
df_year
[
'
uniqueIndex
'
]
==
uniqueID
]
df_tripled
=
pd
.
concat
([
df_tripled
]
*
3
,
ignore_index
=
True
)
...
...
@@ -88,13 +83,26 @@ class MPs_Extractor(object):
# concatenate with yearly dataframe
df_year
=
pd
.
concat
([
df_year
,
df_tripled
],
ignore_index
=
True
)
return
df_year
# if last name is a composite name, e.g. 'von Arx' and 'de Stoppani'
lastname_split
=
lastname
.
split
()
if
len
(
lastname_split
)
>
1
:
for
item
in
lastname_split
:
if
item
not
in
[
'
von
'
,
'
de
'
,
'
Ab
'
,
'
van
'
]:
# update dataframe
df_year
=
update_dataframe
(
df_year
,
item
,
str_completeName
,
str_comp
,
bln_unique
)
else
:
# if last name is a double name, e.g. 'Meier-Müller'
lastname_split2
=
lastname
.
replace
(
'
-
'
,
'
'
).
split
()
if
len
(
lastname_split2
)
>
1
:
# update dataframe
df_year
=
update_dataframe
(
df_year
,
lastname
,
str_completeName
,
str_double
,
bln_unique
)
else
:
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
shortName
'
]
=
lastname
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
completeName
'
]
=
str_completeName
if
bln_unique
:
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
nameType
'
]
=
str_simple
else
:
df_year
.
loc
[(
df_year
[
'
uniqueIndex
'
]
==
uniqueID
),
'
nameType
'
]
=
str_canton
# update dataframe
df_year
=
update_dataframe
(
df_year
,
lastname
,
str_completeName
,
str_simple
,
bln_unique
)
return
df_year
...
...
@@ -175,7 +183,8 @@ class MPs_Extractor(object):
_df_after1890
.
loc
[
list_index
,
'
uniqueIndex
'
]
=
list_index
[
0
]
# some people need additional information such as a place or a second last name to be uniquely identified
for
row
in
self
.
df_addInfo
.
itertuples
(
index
=
False
,
name
=
'
Pandas
'
):
df_addInfo
=
pd
.
read_csv
(
self
.
input_file_addInfo
)
for
row
in
df_addInfo
.
itertuples
(
index
=
False
,
name
=
'
Pandas
'
):
_df_after1890
.
loc
[(
_df_after1890
[
'
LastName
'
]
==
row
[
0
])
&
(
_df_after1890
[
'
FirstName
'
]
==
row
[
1
]),
'
additionalInfo
'
]
=
row
[
3
]
# write dataframe to csv
...
...
@@ -195,26 +204,18 @@ class MPs_Extractor(object):
df_year
=
df_year
.
assign
(
shortName
=
''
)
df_year
=
df_year
.
assign
(
completeName
=
''
)
# write df_year to a yearly csv file
# str_year = str(year)
# df_year.to_csv('home/lili/NLP_DemocraSci/nlp-democracy/output/MPs/MPs_' + str_year + '.csv')
# create a pandas dataframe from list of names
# refine yearly dataframe for name disambiguation
# !!! list contains errors, see definition of function
df_year
=
self
.
get_list_of_lastn
ame
s
(
df_year
,
_df_after1890
)
print
(
df_year
)
df_year
=
self
.
refine_yearly_datafr
ame
(
df_year
)
#
print(df_year)
# dump the yearly dataframe of last names to a pickle file
# path = pathlib.
with
open
(
self
.
output_folder_dict
+
str
(
year
)
+
"
_lastnames.pickle
"
,
'
wb
'
)
as
f
:
pickle
.
dump
(
df_year
,
f
)
# years of interest
years
=
[
1891
,
1893
]
#2016
# open additional info file
df_addInfo
=
pd
.
read_csv
(
input_file_addInfo
)
years
=
[
1891
,
2016
]
#2016
mps_extractor
=
MPs_Extractor
(
years
,
input_file
,
output_file_csv
,
output_folder_dict
,
df_addInfo
)
mps_extractor
=
MPs_Extractor
(
years
,
input_file
,
input_file_addInfo
,
output_file_csv
,
output_folder_dict
)
mps_extractor
.
extract
()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment