Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
democrasci_preprocWP1
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Marta Balode
democrasci_preprocWP1
Commits
59b1071d
Commit
59b1071d
authored
6 years ago
by
Lili Gasser
Browse files
Options
Downloads
Patches
Plain Diff
WIP trying to split speaker from text
parent
ae40dd41
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/python/run_extract_discussions.py
+1
-1
1 addition, 1 deletion
src/python/run_extract_discussions.py
src/python/utils_annot.py
+41
-7
41 additions, 7 deletions
src/python/utils_annot.py
with
42 additions
and
8 deletions
src/python/run_extract_discussions.py
+
1
−
1
View file @
59b1071d
...
@@ -86,7 +86,7 @@ for file_tarpath in files_to_process[66:]:
...
@@ -86,7 +86,7 @@ for file_tarpath in files_to_process[66:]:
# Commands to get the compressed version of the file
# Commands to get the compressed version of the file
#data/AB/${year}/0
2_extrac
tedxml.tar.gz
#data/AB/${year}/0
5_annota
tedxml.tar.gz
utils_proc
.
compress_tar
(
output_annotatedxml
)
utils_proc
.
compress_tar
(
output_annotatedxml
)
...
...
This diff is collapsed.
Click to expand it.
src/python/utils_annot.py
+
41
−
7
View file @
59b1071d
...
@@ -94,7 +94,7 @@ def get_annotated_xml(XML_root, df_lastnames):
...
@@ -94,7 +94,7 @@ def get_annotated_xml(XML_root, df_lastnames):
print
(
textbox
.
tag
,
textbox
.
attrib
)
print
(
textbox
.
tag
,
textbox
.
attrib
)
# get complete text of that textbox
# get complete text of that textbox
complete_text
=
get_complete_text
(
textbox
)
complete_text
,
ind_tl_colon
=
get_complete_text
(
textbox
)
# identify and label language in XML
# identify and label language in XML
dict_lang
=
identify_language
(
complete_text
)
dict_lang
=
identify_language
(
complete_text
)
...
@@ -109,7 +109,7 @@ def get_annotated_xml(XML_root, df_lastnames):
...
@@ -109,7 +109,7 @@ def get_annotated_xml(XML_root, df_lastnames):
print
(
complete_text
)
print
(
complete_text
)
XML_new
,
this_is_speech
=
label_speechstart
(
XML_new
,
ind_p
,
ind_t
,
complete_text
,
df_lastnames
,
list_stopwords
,
bln_print
=
False
)
XML_new
,
this_is_speech
=
label_speechstart
(
XML_new
,
ind_p
,
ind_t
,
complete_text
,
ind_tl_colon
,
df_lastnames
,
list_stopwords
,
bln_print
=
False
)
if
this_is_speech
:
if
this_is_speech
:
prev_is_speech
=
True
prev_is_speech
=
True
# print('stopped after finding speech start')
# print('stopped after finding speech start')
...
@@ -166,6 +166,7 @@ def get_textbox_type(textbox):
...
@@ -166,6 +166,7 @@ def get_textbox_type(textbox):
# - textbox
# - textbox
# output:
# output:
# - complete_text: string
# - complete_text: string
# - ind_tl_colon: index of textline with colon (needed for label speech start)
def
get_complete_text
(
textbox
):
def
get_complete_text
(
textbox
):
# helper function to get text without font information
# helper function to get text without font information
...
@@ -177,16 +178,28 @@ def get_complete_text(textbox):
...
@@ -177,16 +178,28 @@ def get_complete_text(textbox):
newtext
+=
text
[
1
:
-
1
]
newtext
+=
text
[
1
:
-
1
]
#print(newtext)
#print(newtext)
return
newtext
return
newtext
# initialize empty string
# initialize empty string
complete_text
=
''
complete_text
=
''
# initialize index of textline colon to impossible value
ind_tl_colon
=
-
1
# for every textline in that textbox
# for every textline in that textbox
for
ind_tl
,
textline
in
enumerate
(
textbox
):
for
ind_tl
,
textline
in
enumerate
(
textbox
):
if
textline
.
tag
==
'
textline
'
:
if
textline
.
tag
==
'
textline
'
:
# append text to string
# get that text
complete_text
+=
get_text
(
textline
.
text
)
thattext
=
get_text
(
textline
.
text
)
# append that text to string
complete_text
+=
thattext
return
complete_text
# in first two textlines of textbox, check for colon
if
ind_tl
<
3
:
if
'
:
'
in
thattext
:
ind_tl_colon
=
ind_tl
return
complete_text
,
ind_tl_colon
# function to label speech starts
# function to label speech starts
...
@@ -197,7 +210,7 @@ def get_complete_text(textbox):
...
@@ -197,7 +210,7 @@ def get_complete_text(textbox):
# - bln_print: whether to print during execution, default False
# - bln_print: whether to print during execution, default False
# output:
# output:
# - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
# - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
def
label_speechstart
(
XML_new
,
ind_p
,
ind_t
,
text
,
df_names
,
list_stopwords
,
bln_print
=
False
):
def
label_speechstart
(
XML_new
,
ind_p
,
ind_t
,
text
,
ind_tl_colon
,
df_names
,
list_stopwords
,
bln_print
=
False
):
# initialize flag
# initialize flag
this_is_speech
=
False
this_is_speech
=
False
...
@@ -211,7 +224,8 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
...
@@ -211,7 +224,8 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
# very consistently, a speaker can be identified by looking for a colon
# very consistently, a speaker can be identified by looking for a colon
# at the beginning of a textbox and identifying a name or a role in front
# at the beginning of a textbox and identifying a name or a role in front
# of that colon
# of that colon
if
'
:
'
in
text
[:
100
]:
if
ind_tl_colon
>=
0
:
# if ':' in text[:100]:
# extract the index of the colon in the text
# extract the index of the colon in the text
colon_index_text
=
text
.
index
(
'
:
'
)
colon_index_text
=
text
.
index
(
'
:
'
)
...
@@ -264,7 +278,27 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
...
@@ -264,7 +278,27 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
# add speaker to first textline
# add speaker to first textline
XML_new
[
ind_p
][
ind_t
][
0
].
attrib
[
'
speaker
'
]
=
(
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
XML_new
[
ind_p
][
ind_t
][
0
].
attrib
[
'
speaker
'
]
=
(
str_name
,
str_role
,
list_uniqueID
,
str_canton
)
# TODO: split speaker from text (check on which line and split that line accordingly)
# TODO: split speaker from text (check on which line and split that line accordingly)
# TODO account for splitting of [font ...] ... [/font]
if
ind_tl_colon
==
0
:
thattext
=
XML_new
[
ind_p
][
ind_t
][
0
].
text
colon_index
=
thattext
.
index
(
'
:
'
)
try
:
XML_new
[
ind_p
][
ind_t
][
0
].
text
=
thattext
[:
colon_index
+
1
]
XML_new
[
ind_p
][
ind_t
][
1
].
text
=
thattext
[
colon_index
+
1
:]
+
'
'
+
XML_new
[
ind_p
][
ind_t
][
1
].
text
except
:
print
(
'
error in self.input_file when splitting speaker
'
)
pass
if
ind_tl_colon
==
1
:
thattext
=
XML_new
[
ind_p
][
ind_t
][
1
].
text
colon_index
=
thattext
.
index
(
'
:
'
)
XML_new
[
ind_p
][
ind_t
][
0
].
text
=
XML_new
[
ind_p
][
ind_t
][
0
].
text
+
'
'
+
thattext
[:
colon_index
+
1
]
XML_new
[
ind_p
][
ind_t
][
1
].
text
=
thattext
[
colon_index
+
1
:]
# dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
# dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
# text[colon_index_text+1:])
# text[colon_index_text+1:])
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment