diff --git a/src/python/bla_tryreadxml.py b/src/python/bla_tryreadxml.py new file mode 100644 index 0000000000000000000000000000000000000000..dcee82d9c9b6c507aafaa81df11b0b16335c710f --- /dev/null +++ b/src/python/bla_tryreadxml.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +#%% +import xml.etree.ElementTree as ET +import re + +xml_file = '/home/lili/Downloads/1891/20026448_datacorr.xml' + +XML_tree = ET.parse(xml_file) +XML_root = XML_tree.getroot() + + +# for every page +for ind_p, page in enumerate(XML_root): + print(page.tag, page.attrib) + # for every textbox on that page + for ind_t, textbox in enumerate(page): + if (textbox.tag == 'textbox'): + if 'type_textbox' in textbox.attrib.keys(): + if (textbox.attrib['type_textbox'] == 'text'): + print(textbox.tag, textbox.attrib) + + # for every textline in that textbox + for ind_tl, textline in enumerate(textbox): + if textline.tag == 'textline': + print(textline.tag, textline.attrib) + + print(get_text(textline.text)) + +#%% +sometext = '[font face="8.071" size="Times-Bold"]Für die Bedaktion verantwortlich :[/font][font face="7.973" size="Times-BoldItalic"] Sud. SdMarst[/font][font face="8.071" size="Times-Bold"] —• Druck und Expedition von[/font][font face="7.973" size="Times-BoldItalic"] Jmi è Éeineft[/font][font face="8.071" size="Times-Bold"] fa[/font][font face="7.973" size="Times-BoldItalic"] Seìrit. [/font]' + +#re.split('[ | ]', sometext) +def get_text(sometext): + newtext = '' + for text in re.findall('\].*?\[',sometext): + #print(text) + if text.startswith(']') and text.endswith('['): + newtext += text[1:-1] + #print(newtext) + return newtext +get_text(sometext) +#%% + + + # for every text (actually just a letter) +# for ind_ch, text in enumerate(textline): +# print(ind_ch, text.text) #, len(tex0tline), len(XML_new[ind_p][ind_t][ind_tl])) diff --git a/src/python/def_classes.py b/src/python/def_classes.py index 30917e30c4456fd7ffcbac48aa4f0c8899167019..afe3f906d197b500d5f2317e974b888178689967 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -710,6 +710,7 @@ class Document: self.name_outxml = name_outxml if 'XML_main_corr' not in self.__dict__.keys(): + print('no main corr') name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' if os.path.isfile(name_tar): name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' @@ -717,25 +718,24 @@ class Document: h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, self.name_outcorrxml) XML_tree = ET.parse(h_xml) self.XML_main_corr = XML_tree.getroot() - else: - self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, - pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, - name_outcorrxml = self.name_outcorrxml) - else: - # TODO if already exists 02_extractedxml - self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, - pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, - name_outcorrxml = self.name_outcorrxml) - + #else: + #self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, + #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, + #name_outcorrxml = self.name_outcorrxml) + #else: + ## TODO if already exists 02_extractedxml + #self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, + #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, + #name_outcorrxml = self.name_outcorrxml) + + + print('we have a main corr XML file here') #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml) XML_corr = utils_annot.get_text_corrected(self.XML_main_corr) - command = 'rm -rf ./' + str(self.year) #print(command) - utils_proc.call_with_out(command) - - return flag_discussion +# utils_proc.call_with_out(command) diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index d801bb064b76c5fdba50b488e869ff7c5fa3d01a..37ca9694c284a5c9bdbde04f189188147e31c6a6 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -21,7 +21,7 @@ import utils_annot # needed for running in atom, can be ignored input_lastnames = "data/politicians/lastnames/1891_lastnames.pickle" -input_correctedxml = "data/AB/1891/04_correctedxml.tar.gz" +input_correctedxml = "/home/lili/Downloads/1891/04_correctedxml.tar.gz" input_correctedmeta = "data/AB/1891/03_correctedmeta.tar.gz" output_annotatedxml = "data/AB/1891/05_annotatedxml.tar.gz" @@ -98,7 +98,7 @@ with open(input_lastnames, 'rb') as f: df_lastnames.columns #%% -file_tarpath = './1891/20026447_datacorr.xml' +file_tarpath = './1891/20026448_datacorr.xml' file_number = file_tarpath.split('/')[-1][:8] metafile_tarpath = './{}/{}{}.xml'.format(year, file_number, suffix_correctedmeta) @@ -114,8 +114,7 @@ if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952' file_doc.annotate_speakers() - - +#%% diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index f2b44c2247286129e3472d53fc337855ccdd8c47..a9f27fa6c8b620bf4a6af2d208998e849dd32c92 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -12,6 +12,8 @@ import re import os, sys sys.path.append('src/python/') import utils_proc +import copy + # function to check whether a file containts discussions # achieved by excluding title pages, table of content, etc. @@ -47,9 +49,16 @@ def check_if_discussion(path_meta_xml_file, return True -# function to get text of corrected XML - +def get_text(sometext): + newtext = '' + for text in re.findall('\].*?\[',sometext): + #print(text) + if text.startswith(']') and text.endswith('['): + newtext += text[1:-1] + #print(newtext) + return newtext +# function to get text of corrected XML def get_text_corrected(XML_root): # create new XML as a copy of the corrected one @@ -58,7 +67,6 @@ def get_text_corrected(XML_root): # for every page for ind_p, page in enumerate(XML_root): print(page.tag, page.attrib) - # for every textbox on that page for ind_t, textbox in enumerate(page): if (textbox.tag == 'textbox'): @@ -70,17 +78,17 @@ def get_text_corrected(XML_root): for ind_tl, textline in enumerate(textbox): if textline.tag == 'textline': print(textline.tag, textline.attrib) - - print(textline.text) + print(len(textline.text)) + print(get_text(textline.text)) +# print('this is text', textline.text) - - ## for every text (actually just a letter) - #for ind_ch, text in enumerate(textline): - ##print(ind_ch, text.text, len(textline), len(XML_new[ind_p][ind_t][ind_tl])) + # for every text (actually just a letter) +# for ind_ch, text in enumerate(textline): + #print(ind_ch, text.text) #, len(textline), len(XML_new[ind_p][ind_t][ind_tl])) ## extend string #if 'font' in text.attrib.keys(): #if (text.attrib['font'] != prev_fonttype) or (text.attrib['size'] != str(prev_fontsize)): @@ -99,7 +107,6 @@ def get_text_corrected(XML_root): #complete_text = clean_text(complete_text) #XML_new[ind_p][ind_t][ind_tl].text = complete_text - return XML_new