From 5620cf730fe42270c61a31483cfd5301f003bfde Mon Sep 17 00:00:00 2001 From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch> Date: Thu, 29 Nov 2018 16:06:36 +0100 Subject: [PATCH] Copying all files from democrasci project --- notebooks/RunningClasses.ipynb | 195 ++++++ notebooks/browse_politicians.ipynb | 146 +++++ src/python/def_classes.py | 478 +++++++++++++++ src/python/extractMPs.py | 231 +++++++ src/python/plot_tools.py | 257 ++++++++ src/python/preproc_docs.py | 853 ++++++++++++++++++++++++++ src/python/run_correct_meta.py | 54 ++ src/python/run_correctxml.py | 54 ++ src/python/run_extract_discussions.py | 199 ++++++ src/python/run_extract_origxml.py | 53 ++ src/python/utils_annot.py | 600 ++++++++++++++++++ src/python/utils_proc.py | 193 ++++++ src/sh/execute_per_year.sh | 16 + 13 files changed, 3329 insertions(+) create mode 100644 notebooks/RunningClasses.ipynb create mode 100644 notebooks/browse_politicians.ipynb create mode 100644 src/python/def_classes.py create mode 100644 src/python/extractMPs.py create mode 100644 src/python/plot_tools.py create mode 100644 src/python/preproc_docs.py create mode 100644 src/python/run_correct_meta.py create mode 100644 src/python/run_correctxml.py create mode 100644 src/python/run_extract_discussions.py create mode 100644 src/python/run_extract_origxml.py create mode 100644 src/python/utils_annot.py create mode 100644 src/python/utils_proc.py create mode 100755 src/sh/execute_per_year.sh diff --git a/notebooks/RunningClasses.ipynb b/notebooks/RunningClasses.ipynb new file mode 100644 index 00000000..be66f978 --- /dev/null +++ b/notebooks/RunningClasses.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Simple notebook to test the class" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os, sys\n", + "\n", + "sys.path.append('../src/python/')\n", + "\n", + "import def_classes as defc\n", + "import numpy as np\n", + "from pdf2image import convert_from_path, convert_from_bytes\n", + "import utils_proc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_file = './1891/20026455.pdf'\n", + "folder_database = '../data/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the object from the class Document\n", + "Here in principle we should load the file without extension, but it is prepared for that in any case" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d1 = defc.Document(input_file, folder_database)\n", + "print(d1.year, d1.id_doc, d1.input_file, d1.path_file)\n", + "d1.pdf2imgobj()\n", + "print(len(d1.imgobj))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Obtain extracted xml from pdf\n", + "Obtain the xml and save it, also storing the name in the object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d1.pdf2xml()\n", + "print(d1.name_xml, d1.n_pages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(len(d1.XML_main))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute corrected xml file\n", + "This can be directly, without running the step before, because if the xml file is missing, it is 
just extracted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d1.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0,\n", + " pages = 'all', suffix_xml = '_data')\n", + "print(d1.name_xml_corr, len(d1.XML_main))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d1.n_pages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## And finally some plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d1.plot_orig_textb(range_pages = [0], suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d1.plot_margins_doc(range_pages = [0], suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d1.plot_boxes_labels(range_pages = [10], suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d1.plot_textl_ordered(range_pages = [5], suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d1.plot_XMLcorrect(range_pages = [8], suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/browse_politicians.ipynb b/notebooks/browse_politicians.ipynb new file mode 100644 index 00000000..bd591c90 --- /dev/null +++ b/notebooks/browse_politicians.ipynb @@ -0,0 +1,146 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Browse dictionary of politicians" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/home/jovyan/democrasci/notebooks'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "os.getcwd()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"../data/politicians/MPs_lastnames_test.pickle\", \"rb\") as f:\n", + " foo = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(foo)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "dict_keys([1891, 1892, 1893])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "foo.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('Abegg', 'Abegg', 7, 'Zürich', 'ZH', 'Küsnacht (ZH)', 'Johann Jakob'), ('Ador', 'Ador', 23, 'Genf', 'GE', 'Vuiteboeuf (VD),Genf (GE)', 'Gustave'), ('Aeby', 'Aeby', 36, 'Freiburg', 'FR', 'Freiburg (FR)', 'Paul'), ('Amstad', 'Amstad', 131, 'Nidwalden', 'NW', 'Beckenried (NW)', 'Joh. Josef M.'), ('Bachmann', 'Bachmann', 219, 'Thurgau', 'TG', 'Stettfurt (TG)', 'Jakob H.'), ('Bähler', 'Bähler', 236, 'Bern', 'BE', 'Längenbühl (BE)', 'Traugott-Philipp Eduard'), ('Baldinger', 'Baldinger', 238, 'Aargau', 'AG', 'Baden (AG)', 'Emil A.'), ('Balli', 'Balli', 242, 'Tessin', 'TI', 'Cavergno (TI)', 'Francesco'), ('Bangerter', 'Bangerter', 253, 'Bern', 'BE', 'Lyss (BE)', 'Gottfried'), ('Baud', 'Baud', 280, 'Waadt', 'VD', 'Apples (VD)', 'Charles-H.-A.')]\n", + "7\n", + "[('Cramer', 'Cramer', 1150, 'Zürich', 'ZH', 'Zürich (ZH),Volken (ZH)', 'Konrad H.'), ('Frey', 'Cramer', 1150, 'Zürich', 'ZH', 'Zürich (ZH),Volken (ZH)', 'Konrad H.'), ('Cramer-Frey', 'Cramer', 1150, 'Zürich', 'ZH', 'Zürich (ZH),Volken (ZH)', 'Konrad H.'), ('CramerFrey', 'Cramer', 1150, 'Zürich', 'ZH', 'Zürich (ZH),Volken (ZH)', 'Konrad H.'), ('Jordan', 'Jordan', 2792, 'Waadt', 'VD', 'Granges-près-Marnand (VD)', 'Adolphe'), ('Martin', 'Jordan', 2792, 'Waadt', 'VD', 'Granges-près-Marnand (VD)', 'Adolphe'), ('Jordan-Martin', 'Jordan', 2792, 'Waadt', 'VD', 'Granges-près-Marnand (VD)', 'Adolphe'), ('JordanMartin', 'Jordan', 2792, 'Waadt', 'VD', 'Granges-près-Marnand (VD)', 'Adolphe'), ('Scherrer', 'Scherrer-Füllemann', 4582, 'St. Gallen', 'SG', 'Kirchberg (SG)', 'Joseph Anton'), ('Füllemann', 'Scherrer-Füllemann', 4582, 'St. Gallen', 'SG', 'Kirchberg (SG)', 'Joseph Anton')]\n", + "7\n", + "[('Chastonay', 'de Chastonay', 1211, 'Wallis', 'VS', 'Siders (VS)', 'Victor'), ('Schaller', 'de Schaller', 1252, 'Freiburg', 'FR', 'Corminboeuf (FR),Freiburg (FR)', 'Henri Gaspard'), ('Stoppani', 'de Stoppani', 1260, 'Tessin', 'TI', 'Ponte Tresa (TI)', 'Leone'), ('Werra', 'de Werra', 1266, 'Wallis', 'VS', 'St-Maurice (VS)', 'Charles'), ('Wuilleret', 'de Wuilleret', 1270, 'Freiburg', 'FR', 'Romont (FR),Freiburg (FR)', 'Louis'), ('Arx', 'von Arx', 5438, 'Solothurn', 'SO', 'Olten (SO)', 'Casimir'), ('Matt', 'von Matt', 5464, 'Nidwalden', 'NW', 'Stans (NW)', 'Hans sen.'), ('Roten', 'von Roten', 5482, 'Wallis', 'VS', 'Raron (VS)', 'Hans Anton'), ('Steiger', 'von Steiger', 5507, 'Bern', 'BE', 'Bern (BE)', 'K. Fr. Edmund')]\n", + "7\n", + "[('Blumer', 'Blumer (Zürich)', 510, 'Zürich', 'ZH', 'Embrach (ZH),Glarus (GL)', 'Othmar'), ('Blumer', 'Blumer (St. Gallen)', 516, 'St. Gallen', 'SG', 'Glarus (GL)', 'Johannes'), ('Bühler', 'Bühler (Graubünden)', 807, 'Graubünden', 'GR', 'Davos (GR)', 'Peter Theophil'), ('Bühler', 'Bühler (Bern)', 810, 'Bern', 'BE', 'Aeschi b. Spiez (BE)', 'Arnold Gottlieb'), ('Bühler', 'Bühler (Zürich)', 812, 'Zürich', 'ZH', 'Hombrechtikon (ZH)', 'Joh. Heinrich'), ('Dufour', 'Dufour (Genf)', 1420, 'Genf', 'GE', 'Genf (GE)', 'Jean-Etienne'), ('Dufour', 'Dufour (Bern)', 1421, 'Bern', 'BE', 'Genf (GE)', 'Guillaume-Henri'), ('Good', 'Good (St. Gallen)', 2101, 'St. Gallen', 'SG', 'Mels (SG)', 'Wilhelm'), ('Good', 'Good (St. Gallen)', 2102, 'St. 
Gallen', 'SG', 'Mels (SG)', 'Karl Friedrich'), ('Hauser', 'Hauser (Bern)', 2378, 'Bern', 'BE', 'Wädenswil (ZH),Rüti b. Riggisberg (BE)', 'Johann Jakob')]\n", + "7\n" + ] + } + ], + "source": [ + "for elm in foo[1891]:\n", + " print(elm[:10])\n", + " print(len(elm[0]))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/python/def_classes.py b/src/python/def_classes.py new file mode 100644 index 00000000..2ead54ab --- /dev/null +++ b/src/python/def_classes.py @@ -0,0 +1,478 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Sep 28 13:31:06 2018 + +@author: luissalamanca +""" + +import sys, os + +from colour import Color +import matplotlib.image as mpimg +from mpl_toolkits.mplot3d import Axes3D +import matplotlib.pyplot as plt +import numpy as np +import xml.etree.ElementTree as ET +import copy +import time +import tarfile +import pickle + +from pdf2image import convert_from_path, convert_from_bytes + +import utils_proc +import plot_tools +import preproc_docs + + + + + +# Definition of classes and methods associated + +class Document: + + limit_year = 1950 + flag_end_run = 1 + name_inpdf = '00_rawpdfs' + name_inmeta = '01_rawmeta' + + def __init__(self, input_file, folder_database): + self.year = int(input_file.split('/')[-2]) + self.id_doc = input_file.split('/')[-1].split('.')[0] + self.input_file = input_file + _, self.name_file = os.path.split(input_file) + self.path_file = folder_database + str(self.year) + '/' + self.name_wo_ext = os.path.splitext(self.name_file)[0] + self.folder_database = folder_database + self._meta_ext() + + def _meta_ext(self): + # Both for the correction and the extraction of the metadata information + name_file = str(self.year) + '/' + self.id_doc + '.xml' + name_file_db = str(self.year) + '/' + self.id_doc + '.db' + name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz' + self.name_meta = [name_tar, name_file, name_file_db] + + def meta_correct(self, name_outmeta = '03_correctedmeta'): + utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) + utils_proc.tar_extractfile(self.name_meta[2], self.folder_database, name_file = self.name_inmeta) + name_meta_corr = utils_proc.correct_metadata(self.year, self.id_doc, self.flag_end_run) + name_tar = utils_proc.addto_tar(name_meta_corr, self.folder_database, name_file = name_outmeta) + self.name_outmeta = name_outmeta + command = 'rm -rf ./' + str(self.year) + #print(command) + utils_proc.call_with_out(command) + + def pdf2imgobj(self, resolution = 100): + + self.resolution = resolution + utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf) + self.imgobj = convert_from_path(self.input_file, dpi = resolution) + command = 'rm -rf ./' + str(self.year) + utils_proc.call_with_out(command) + + def _get_pages(self, pages = 'all'): + if pages == 'all': + self.n_pages = np.arange(len(self.imgobj)) + elif isinstance(pages,str): + self.n_pages = np.array(pages.split(',')).astype(np.uint32) + else: + self.n_pages = np.array(pages) + + def pdf2xml(self, pages = 'all', suffix_xml = '_data', flag_save = 1, + 
name_outxml = '02_extractedxml'): + # To extract the embedded text of the pdf into an xml file + if 'imgobj' not in self.__dict__.keys(): + self.pdf2imgobj() + self._get_pages(pages = pages) + + utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf) + name_xml = utils_proc.pdf2xml(self.input_file, page_n = self.n_pages + 1, suffix_str = suffix_xml, + flag_end = self.flag_end_run) + if flag_save: + name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outxml) + else: + name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz' + + command = 'rm -rf ./' + str(self.year) + #print(command) + utils_proc.call_with_out(command) + self.name_xml = [name_tar, name_xml] + h_xml = utils_proc.get_handlerfile(self.name_xml[1], self.folder_database, name_file = name_outxml) + self.name_outxml = name_outxml + XML_tree = ET.parse(h_xml) + self.XML_main = XML_tree.getroot() + self.n_pages = np.arange(len(self.XML_main)) + + + def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None, + ind_page = 0, textb_textl = 1): + # The page refers here to the page of the imgobj, which might not correspond + # to the one of the xml. For that reason we use n_pages to obtain the index + # for the xml + # textb_textl = 1 for textboxes, and 2 for textlines + if (XML_root == None) and (XML_main == None): + return print('Not possible! - You need to provide a valid XML\n') + if np.sum(imarray.shape) == 0: + if 'imgobj' not in self.__dict__.keys(): + imarray = np.array(self.imgobj[ind_page]) + else: + return print('Not possible! - You need to convert first the pdf to image\n') + + if XML_root == None: + XML_root = ET.Element('pages') + ind_abs = np.argwhere(self.n_pages == ind_page) + XML_root.append(XML_main[ind_abs]) + + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + + imarray_textb = np.copy(imarray) + + if textb_textl == 1: + coord_textboxes = np.array([]).reshape((4,0)) + for ind_el in range(0, len(XML_root[0])): + if XML_root[0][ind_el].tag == 'textbox': + coord_textbox_aux = np.array(XML_root[0][ind_el].attrib['bbox'].split(',')).astype(np.float64) + coord_textboxes = np.concatenate((coord_textboxes, np.array(coord_textbox_aux).reshape((4,1))), axis = 1) + imarray_textb = plot_tools.highlight_text(imarray_textb, coord_textbox_aux, + bbox_page, color_vec = 'blue', alpha = True, + filled = False, thick_line = 6) + return imarray_textb, coord_textboxes + elif textb_textl == 2: + imarray_textl = np.copy(imarray) + coord_textline = np.array([]).reshape((4,0)) + all_font_sizes = np.array([]) + for ind_el in range(0, len(XML_root[0])): + for ind_line in range(0, len(XML_root[0][ind_el])): + if XML_root[0][ind_el][ind_line].tag == 'textline': + coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) + if len(XML_root[0][ind_el][ind_line]): + all_font_sizes = np.concatenate((all_font_sizes, + np.array([XML_root[0][ind_el][ind_line][0].attrib['size']]).astype(np.float64))) + coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1) + imarray_textl = plot_tools.highlight_text(imarray_textl, coord_textline_aux, bbox_page, + color_vec = 'red', alpha = True, filled = False, thick_line = 6) + + all_font_sizes, counts_all_font_sizes = np.unique(all_font_sizes, return_counts=True) + info_font_sizes = np.concatenate((all_font_sizes.reshape((1,all_font_sizes.shape[0])), + 
counts_all_font_sizes.reshape((1,all_font_sizes.shape[0])).astype(np.float64))) + + return imarray_textb, coord_textline, all_font_sizes, info_font_sizes + + def correct_xml(self, flag_plots = 1, flag_parallel = 0, flag_save_figs = 1, + pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml', + name_outcorrxml = '04_correctedxml'): + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + start_time = time.time() + if 'imgobj' not in self.__dict__.keys(): + self.pdf2imgobj() + + if 'XML_main' not in self.__dict__.keys(): + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outxml + '.tar.gz' + if os.path.isfile(name_tar): + name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml' + if name_xml in utils_proc.get_list(self.year, self.folder_database, self.name_outxml)[0]: + h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, self.name_outxml) + XML_tree = ET.parse(h_xml) + self.XML_main = XML_tree.getroot() + else: + # TODO if already exists 02_extractedxml + self.pdf2xml(pages = pages, suffix_xml = suffix_xml) + + self._get_pages(pages = pages) + flag_central = 1 + if self.year > self.limit_year: + flag_central = 0 + flag_2col = 1 + + XML_new = ET.Element('pages') + + for ind_abs, ind_page in enumerate(self.n_pages): + + XML_root = ET.Element('pages') + #print(ind_abs,len(self.XML_main)) + XML_root.append(self.XML_main[ind_abs]) + imarray = np.array(self.imgobj[ind_page]) + + if XML_root[0][0].tag == 'textbox': + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + dim_img = imarray.shape[:2] + _, rescale_factor = plot_tools.adapt_coordtoimg(imarray, bbox_page, bbox_page) + + # Image with textboxes highlighted + imarray_textblock, coord_textboxes = self._draw_textbl(imarray = imarray, XML_root = XML_root) + + # Image with textlines highlighted, BUT also, array with all textlines + # coordinates, and the fontsizes, required for later + _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, + textb_textl = 2) + + ##### + # Central vertical line and horizontal lines, through Hough transform + coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, + flag_2col, flag_central) + + ##### + # Obtain lateral margins + margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + coord_horz.astype(np.uint32)) + + # Top and bottom line + ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + coord_horz.astype(np.uint32)) + + ##### + # Label the textboxes based on a set of simple rules that make use of + # the margins and the fontsizes + label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \ + preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) # info_font_sizes_est + + ##### + # Order the textlines, taken all them together, in order to later merge + # in a single textbox textlines that so far form different textboxes + set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, + list_allcoords_textlines, margins) + + # Given the ordered textlines, group them in new textboxes, creating a + # XML, This uses some criteria of distance between paragraphs + XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, 
relative_ref_textline, vec_labels_textline, + rescale_factor, centrall_ord, ind_page, dim_img) + + # Append to the new XML + XML_new.append(XML_enrich[0]) + + + if flag_plots: + im_met2 = plot_tools.plot_horzvertlines(imarray_textblock, coord_horz, coord_vert_def) + im_met2 = plot_tools.plot_margins(im_met2, margins, ind_limits, gap_line = 1) + im_met3 = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) + im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) + im_met5 = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page) + + # Create figure with 4 subplots, for showing all results + if flag_save_figs: + path_output_img = self.path_file + '/previews' + if flag_save_figs: + if not os.path.exists(path_output_img): + os.makedirs(path_output_img) + + if flag_parallel: + if flag_save_figs: + name_pickle = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.pkl' + with open(name_pickle, 'wb') as f: # Python 3: open(..., 'wb') + pickle.dump([im_met2, im_met3, im_met4, im_met5], f) + + else: + fig, axes = plt.subplots(1, 4, figsize=(30, 10)) + ax = axes.ravel() + ax[0].axis('off') + ax[0].imshow(im_met2) + ax[1].axis('off') + ax[1].imshow(im_met3) + ax[2].axis('off') + ax[2].imshow(im_met4) + ax[3].axis('off') + ax[3].imshow(im_met5) + + if flag_save_figs: + format_fig = 'png' + name_fig = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.' + format_fig + fig.savefig(name_fig, format = format_fig, dpi = 200) + plt.close(fig) + + name_xml_prev = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corrprev.xml' + + tree = ET.ElementTree(XML_new) + self.XML_main_corr = XML_new + if not os.path.exists('./' + str(self.year)): + os.makedirs('./' + str(self.year)) + tree.write(name_xml_prev, encoding = 'utf-8') + XML_new = preproc_docs.get_text_onefile(self.XML_main_corr) + name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' + tree = ET.ElementTree(XML_new) + tree.write(name_xml, encoding = 'utf-8-sig') + + name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outcorrxml) + self.name_outcorrxml = name_outcorrxml + self.name_xml_corr = [name_tar, name_xml] + command = 'rm -rf ./' + str(self.year) + #print(command) + utils_proc.call_with_out(command) + + print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time))) + + #XML_tree = ET.parse(name_xml) + #self.XML_main = XML_tree.getroot() + + def _plot_generic_open(self, ind_page, suffix_xml, level_proc = 0, + name_outxml = '02_extractedxml'): + # ind_page has to be a scalar + + + if 'XML_main' not in self.__dict__.keys(): + name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz' + if os.path.isfile(name_tar): + name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml' + if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outxml)[0]: + h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outxml) + XML_tree = ET.parse(h_xml) + self.XML_main = XML_tree.getroot() + else: + self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + ind_abs = ind_page.reshape((-1,)).astype(int) + else: + #print('Run this') + ind_abs = np.argwhere(self.n_pages == ind_page).reshape((-1,)) + #print(ind_abs, type(ind_abs)) + #print(self.XML_main, len(self.imgobj)) + + XML_root = ET.Element('pages') + XML_root.append(self.XML_main[ind_abs[0]]) + imarray = 
np.array(self.imgobj[ind_page]) + + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + dim_img = imarray.shape[:2] + + _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, + textb_textl = 2) + margins = [] + ind_limits = [] + label_textlines = [] + list_allcoords_textlines = [] + set_of_blocks = [] + XML_enrich = [] + + if level_proc > 0: + coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, + flag_2col = 1) + + if level_proc > 1: + _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page) + + if level_proc > 2: + ##### + # Obtain lateral margins + margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + coord_horz.astype(np.uint32)) + + if level_proc > 3: + # Top and bottom line + ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + coord_horz.astype(np.uint32)) + + if level_proc > 4: + label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \ + preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) + + if level_proc > 5: + set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, + list_allcoords_textlines, margins) + + if level_proc > 6: + XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, + rescale_factor, centrall_ord, ind_page, dim_img) + + return imarray, margins, ind_limits, label_textlines, list_allcoords_textlines, \ + set_of_blocks, XML_enrich, bbox_page, XML_root, ind_abs + + + def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data', + flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + for ind_page in range_pages: + imarray, margins, ind_limits, _, _, \ + _, _, _, XML_root, _ = self._plot_generic_open(ind_page, suffix_xml, level_proc = 0, + name_outxml = self.name_outxml) + + imarray_textblock, _ = self._draw_textbl(imarray = imarray, XML_root = XML_root) + + self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, + flag_plot, flag_save_figs) + + def plot_margins_doc(self, range_pages = range(1), suffix_xml = '_data', + flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + for ind_page in range_pages: + imarray, margins, ind_limits, _, _, \ + _, _, _, _, _ = self._plot_generic_open(ind_page, suffix_xml, level_proc = 4, + name_outxml = self.name_outxml) + + im_met = plot_tools.plot_margins(imarray, margins, ind_limits, gap_line = 1) + + self._plot_save(im_met, 'Page margins', 'Margins', ind_page, self.path_file, + flag_plot, flag_save_figs) + + def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', + flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + for ind_page in range_pages: + imarray, _, _, label_textlines, list_allcoords_textlines, _, _, _, _, _ = \ + self._plot_generic_open(ind_page, suffix_xml, level_proc = 5, + name_outxml = self.name_outxml) + + im_met = plot_tools.plot_labelled_boxes(imarray,label_textlines, 
list_allcoords_textlines) + + self._plot_save(im_met, 'Textboxes labelled', 'TextbLabel', ind_page, self.path_file, + flag_plot, flag_save_figs) + + def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', + flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + for ind_page in range_pages: + imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _ = \ + self._plot_generic_open(ind_page, suffix_xml, level_proc = 6, + name_outxml = self.name_outxml) + + im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) + + self._plot_save(im_met, 'Textlines ordered', 'TextlOrder', ind_page, self.path_file, + flag_plot, flag_save_figs) + + def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', + flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + for ind_page in range_pages: + imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _ = \ + self._plot_generic_open(ind_page, suffix_xml, level_proc = 7, + name_outxml = self.name_outxml) + + im_met = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page) + + self._plot_save(im_met, 'XML corrected', 'XMLcorrect', ind_page, self.path_file, + flag_plot, flag_save_figs) + + def _plot_save(self, im_met, str_title, str_name, ind_page, folder_save = '', + flag_plot = 1, flag_save_figs = 0, dpi = 200): + if flag_plot: + fig, axes = plt.subplots(1, 1, figsize=(8, 10)) + axes.axis('off') + axes.imshow(im_met) + plt.title(str_title) + if flag_save_figs: + format_fig = 'png' + name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + + '_page' + str(ind_page) + '.' 
+ format_fig) + fig.savefig(name_fig, format = format_fig, dpi = dpi) + plt.close(fig) + + \ No newline at end of file diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py new file mode 100644 index 00000000..cd3504da --- /dev/null +++ b/src/python/extractMPs.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 + +import pathlib +import pandas as pd +import datetime +import pickle +import sys + +input_file = sys.argv[1] #'./data/politicians/Ratsmitglieder_1848_DE_corr.xlsx' +output_file_csv = sys.argv[2] #'./data/politicians/MPs_after1890.csv' +output_folder_dict = sys.argv[3] + + +class MPs_Extractor(object): + + def __init__(self, years, input_file, output_file_csv, output_folder_dict, df_exc): + self.input_file = input_file + self.output_file_csv = output_file_csv + self.output_folder_dict = output_folder_dict + self.range_years = range(years[0], years[1] + 1) + self.df_exc = df_exc + + # function to get lists of lastnames + # input: + # - df_year: dataframe for a year + # output: + # - list_names: + # contains: + # - list of last names that appear only once and cannot be split + # - list of last name that are made up of two names such as 'Meier-Müller' + # for each double name, four entries are made: + # - ('Meier', 'Meier-Müller') + # - ('Müller', 'Meier-Müller') + # - ('Meier-Müller', 'Meier-Müller') + # - ('MeierMüller', 'Meier-Müller') + # - list for composite last names such as 'von Arx' or 'de Stoppani' + # will be saved as ('Arx', 'von Arx') + # - list for people with the same last names + # will be saved as (lastname, lastname (canton)) for each person + # if the name is a composite name: ('Arx', 'von Arx (canton)') + def get_list_of_lastnames(self, df_year, df_after1890): + str_simple = 'simple' + str_double = 'double' + str_comp = 'comp' + str_canton2 = 'canton' + + # function to split lastname and save meaningful part(s) to list + def split_lastname(lastname, uniqueID, tpl_canton, str_canton = ''): + # if last name is a composite name, e.g. 'von Arx' and 'de Stoppani' + lastname_split = lastname.split() + if len(lastname_split) > 1: + for item in lastname_split: + if item not in ['von', 'de', 'Ab', 'van']: + # write distinctive item to extended list + if str_canton: + list_names.append((str_canton2, item, str_canton, uniqueID) + tpl_canton) + else: + list_names.append((str_comp, item, lastname, uniqueID) + tpl_canton) + else: + # if last name is a double name, e.g. 
'Meier-Müller' + lastname_split2 = lastname.replace('-', ' ').split() + if len(lastname_split2) > 1: + # write each part of double name into corresponding list + for item in lastname_split2: + if str_canton: + list_names.append((str_canton2, item, str_canton, uniqueID) + tpl_canton) + else: + list_names.append((str_double, item, lastname, uniqueID) + tpl_canton) + # write double name into list + list_names.append((str_double, lastname, lastname, uniqueID) + tpl_canton) + # write double name without space into list + list_names.append((str_double, ''.join(lastname.split('-')), lastname, uniqueID) + tpl_canton) + else: + if str_canton: + list_names.append((str_canton2, lastname, str_canton, uniqueID) + tpl_canton) + else: + list_names.append((str_simple, lastname, lastname, uniqueID) + tpl_canton) + + # function to get canton and citizenship for uniqueID + def get_canton(df_year, uniqueID): + str_cantonname = df_year['CantonName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + str_cantonabbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + str_citizenship = df_year['Citizenship'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + str_firstname = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + + return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname) + + # create empty lists for last names + list_names = [] + + # for every last name + for lastname in df_year['LastName'].drop_duplicates(): + #print('name', lastname, type(lastname)) + + # extract all entries with that last name + df_temp = df_year.loc[df_after1890['LastName']==lastname] + #print(df_temp) + + # if there is an extra double name + if df_temp.iloc[0]['DoubleName'] != '': + # extract unique index + uniqueID = df_temp.iloc[0]['uniqueIndex'] + + # get canton information for that uniqueID + tpl_canton = get_canton(df_year, uniqueID) + + #print('double name', df_temp) + doublename = df_temp.iloc[0]['DoubleName'] + + # if last name is a double name, e.g. 
'Meier-Müller' + lastname_split2 = doublename.replace('-', ' ').split() + if len(lastname_split2) > 1: + # write each part of double name into corresponding list + for item in lastname_split2: + list_names.append((str_double, item, lastname, uniqueID) + tpl_canton) + # write double name into list + list_names.append((str_double, doublename, lastname, uniqueID) + tpl_canton) + # write double name without space into list + list_names.append((str_double, ''.join(doublename.split('-')), lastname, uniqueID) + tpl_canton) + + # if only one person with that last name + if df_temp.drop_duplicates(['uniqueIndex']).shape[0] == 1: + # extract unique index + uniqueID = df_temp.iloc[0]['uniqueIndex'] + + # get canton information for that uniqueID + tpl_canton = get_canton(df_year, uniqueID) + + # write complete name to list of last names + split_lastname(lastname, uniqueID, tpl_canton) + + # if there are several people with the same last name + else: + # write last name and canton to correct list + for idx, row in df_temp.drop_duplicates(['uniqueIndex']).iterrows(): + # extract unique index + uniqueID = df_temp.loc[idx]['uniqueIndex'] + + # get canton information for that uniqueID + tpl_canton = get_canton(df_year, uniqueID) + + # write the lastname to the list + split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')') + + return list_names + + def extract(self): + # read excel file and save first sheet to a dataframe + xl = pd.ExcelFile(self.input_file) + str_sheetname = xl.sheet_names[0] + orddict = xl.parse([str_sheetname]) + df = orddict[str_sheetname] + + # drop duplicate entries + df = df.drop_duplicates() + + # extract all people participating after 1890 + # starting from 1891 + df1 = df[pd.to_datetime(df['DateLeaving']) > datetime.datetime(1890, 12, 31)] + # get rid of people with wrong DateLeaving that is encoded as 1899-12-30 00:00:00 instead of dd.mm.yyyy + df1 = df1[df1['DateLeaving'].str.len() == 10] + # current members + df2 = df[df['DateLeaving'].isnull()] + # combine these dataframes + df_after1890 = pd.concat([df1, df2]) + + # generate unique ID for every person + # generate two now columns + df_after1890 = df_after1890.assign(uniqueIndex=0) + df_after1890 = df_after1890.assign(DoubleName='') + + # group by first and last name, and date of birth + grouped = df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"]) + + # assign first index to all entries of a person + for list_index in grouped.groups.values(): + df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0] + + # some people are referred to by their double-name, add these double-namse to extra-column + for row in self.df_exc.itertuples(index=False, name='Pandas'): + df_after1890.loc[(df_after1890['LastName'] == row[0]) & (df_after1890['FirstName'] == row[1]), 'DoubleName'] = row[2] + +# print(df_after1890.loc[df_after1890['DoubleName'] != '']) + # write dataframe to csv + df_after1890.to_csv(self.output_file_csv) + + # for every year + for year in self.range_years: + + # extract every MP that was active in that year + # (every MP of a year joined before the end of the and left after the beginning of the year) + df_year = df_after1890[pd.to_datetime(df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)] + df_year = df_year[pd.to_datetime(df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)] + print(year, df_year.shape) + + # write df_year to a yearly csv file + # str_year = str(year) + # 
df_year.to_csv('home/lili/NLP_DemocraSci/nlp-democracy/output/MPs/MPs_' + str_year + '.csv') + + # create a pandas dataframe from list of names + # !!! list contains errors, see definition of function + list_lastnames = self.get_list_of_lastnames(df_year, df_after1890) + df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName')) + + # dump dictionary of last names to a pickle file +# path = pathlib. + with open(self.output_folder_dict + str(year) + "_lastnames.pickle", 'wb') as f: + pickle.dump(df_lastnames, f) + + +# years of interest +years = [1891, 2016] #2016 + +df_exc = pd.DataFrame(columns=['LastName', 'FirstName', 'DoubleName']) +# exception: Konrad H. Cramer is also reffered to as Cramer-Frey. Add double name in extra-column +df_exc.loc[len(df_exc)] = {'LastName': 'Cramer', 'FirstName': 'Konrad H.', 'DoubleName': 'Cramer-Frey'} +# exception: Johannes Blumer SG is also reffered to as Blumer-Egloff. Add double name in extra-column +df_exc.loc[len(df_exc)] = {'LastName': 'Blumer', 'FirstName': 'Johannes', 'DoubleName': 'Blumer-Egloff'} +# exception: Adolphe Jordan VD is also reffered to as Jordan-Martin. Add double name in extra-column +df_exc.loc[len(df_exc)] = {'LastName': 'Jordan', 'FirstName': 'Adolphe', 'DoubleName': 'Jordan-Martin'} +# exception: Jakob Schmid LU is also reffered to as Schmid-Ronca. Add double name in extra-column +df_exc.loc[len(df_exc)] = {'LastName': 'Schmid', 'FirstName': 'Jakob', 'DoubleName': 'Schmid-Ronca'} +# exception: Eduard Sulzer ZH is also reffered to as Sulzer-Ziegler. Add double name in extra-column +df_exc.loc[len(df_exc)] = {'LastName': 'Sulzer', 'FirstName': 'Eduard', 'DoubleName': 'Sulzer-Ziegler'} +# exception: Howard Eugster AR is also reffered to as Eugster-Züst. 
Add double name in extra-column +df_exc.loc[len(df_exc)] = {'LastName': 'Eugster', 'FirstName': 'Howard', 'DoubleName': 'Eugster-Züst'} +#print(df_exc) + +mps_extractor = MPs_Extractor(years, input_file, output_file_csv, output_folder_dict, df_exc) +mps_extractor.extract() + diff --git a/src/python/plot_tools.py b/src/python/plot_tools.py new file mode 100644 index 00000000..f016182e --- /dev/null +++ b/src/python/plot_tools.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Sep 28 14:57:53 2018 + +@author: luissalamanca +""" + +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri May 4 09:55:26 2018 + +@author: luissalamanca +""" + + +from colour import Color +import numpy as np + +import os +import pickle +import re + +import matplotlib.pyplot as plt + +HEIGHT_CHAR = 12 +WIDTH_CHAR = 6 + +def lines_box(img, coord, color_vec, thick_line): + # Here we also consider the reference of: ref00 = toplet, and + # corners in topleftbottomright, and rows and colums + if isinstance(color_vec,str): + color_vec = np.uint8(np.array(Color(color_vec).rgb)*255) + + img[(coord[0]):(coord[0]+thick_line),coord[1]:coord[3],:] = color_vec + img[(coord[2]-thick_line):(coord[2]),coord[1]:coord[3],:] = color_vec + img[coord[0]:coord[2],(coord[1]):(coord[1]+thick_line),:] = color_vec + img[coord[0]:coord[2],(coord[3]-thick_line):coord[3],:] = color_vec + return img + +def transform_coord(coord, dim_page = np.array([3000,1800]), invert_xy = False, + rescale = False, scale_fact = np.array([1, 1]), ref00 = 'topleft', + refCorners = 'topleftbottomright'): + # scale_fact larger than 1 if we want to upscale, or smaller if the other + # way around + # Also, if the reference for the bbox are not + if coord.ndim == 1: + coord.resize((4,1)) + if invert_xy: + index_p = [1, 0, 3, 2] + coord = coord[index_p, :] + if ref00 == 'bottomleft': + coord[0, :] = dim_page[0] - coord[0, :] + coord[2, :] = dim_page[0] - coord[2, :] + if rescale: + if scale_fact.shape[0] == 1: + scale_fact = scale_fact.repeat(4); + elif scale_fact.shape[0] == 2: + scale_fact.resize((1,2)) + scale_fact = np.tile(scale_fact, 2); + scale_fact = np.resize(scale_fact,(4,1)) + coord = (coord*scale_fact).astype(np.uint32) + if refCorners == 'bottomlefttopright': + index_s = [2, 1, 0, 3] + coord = coord[index_s, :] + + return coord + +def transform_coord_toorig(coord, dim_page = np.array([3000,1800]), invert_xy = False, + rescale = False, scale_fact = np.array([1, 1]), ref00 = 'bottomleft', + refCorners = 'bottomlefttopright'): + # scale_fact larger than 1 if we want to upscale, or smaller if the other + # way around + # Also, if the reference for the bbox are not + + if refCorners == 'topleftbottomright': + index_s = [2, 1, 0, 3] + coord = coord[index_s, :] + if ref00 == 'topleft': + coord[0, :] = dim_page[0] - coord[0, :] + coord[2, :] = dim_page[0] - coord[2, :] + if rescale: + if scale_fact.shape[0] == 1: + scale_fact = scale_fact.repeat(4); + elif scale_fact.shape[0] == 2: + scale_fact.resize((1,2)) + scale_fact = np.tile(scale_fact, 2); + scale_fact = np.resize(scale_fact,(4,1)) + coord = (coord/scale_fact).astype(np.float64) + if invert_xy: + index_p = [1, 0, 3, 2] + coord = coord[index_p, :] + + return coord + +def adapt_coordtoimg(img, coord, dim_bbox_page): + dim_page_pix = np.array(img.shape) + dim_page_pix.resize((img.ndim,1)) + + dim_bbox_page = transform_coord(dim_bbox_page, invert_xy = True) + rescale_factor = dim_page_pix[0:2]/dim_bbox_page[2:] + + coord = transform_coord(coord, dim_page = 
dim_bbox_page[2:], + invert_xy = True, ref00 = 'bottomleft', rescale = True, + scale_fact = rescale_factor, refCorners = 'bottomlefttopright') + return coord, rescale_factor + +def highlight_text(img, coord, dim_bbox_page, color_vec = np.array([255, 0, 0], dtype = np.uint8), alpha = False, filled = True, thick_line = 3): + # In coord, we have the coordinates as bottom left corner and top right corner, + # ref00 in the bottom left, and the order of x and y inverted. + # But here, we do need the coordinates as ref00 in the top left, and the + # corners in the top left and bottom right, and ordered in rows and columns, + # i.e., y and x. Plus, all rescaled! Thus, we transform as follows: + + coord, rescale_factor = adapt_coordtoimg(img, coord, dim_bbox_page) + + # When we leave transparency to true we only modify the pixels with + # all values equal to 255, i.e. white, leaving the rest as they are + # If filled is False, we just do a box surrounding + coord = np.resize(coord,[4]) + if isinstance(color_vec,str): + c_aux = Color(color_vec) + color_vec = np.uint8(np.array(c_aux.rgb)*255) + elif isinstance(color_vec[0], float): + color_vec = np.uint8(np.array(color_vec)*255) + + aux_img = np.copy(img) + if filled: + if alpha: + aux_img[coord[0]:coord[2],coord[1]:coord[3],:] = color_vec + img = np.minimum(img, aux_img) + else: + img[coord[0]:coord[2],coord[1]:coord[3],:] = color_vec + else: + if alpha: + aux_img = lines_box(img, coord, color_vec, thick_line) + img = np.minimum(img, aux_img) + else: + img = lines_box(img, coord, color_vec, thick_line) + return img + + +def plot_labelled_boxes(img,label_textlines, list_allcoords_textlines, thick_l = 6): + + img_boxes = np.copy(img) + + for ind_t in label_textlines['footnote']: + img_boxes = lines_box(img_boxes, list_allcoords_textlines[:,ind_t].astype(np.uint32), + color_vec = 'red', thick_line = thick_l) + for ind_t in label_textlines['header']: + img_boxes = lines_box(img_boxes, list_allcoords_textlines[:,ind_t].astype(np.uint32), + color_vec = 'blue', thick_line = thick_l) + for ind_t in label_textlines['header_singlecol']: + img_boxes = lines_box(img_boxes, list_allcoords_textlines[:,ind_t].astype(np.uint32), + color_vec = 'black', thick_line = thick_l) + for ind_t in label_textlines['pagen']: + img_boxes = lines_box(img_boxes, list_allcoords_textlines[:,ind_t].astype(np.uint32), + color_vec = 'green', thick_line = thick_l) + for ind_t in label_textlines['text_col1']: + img_boxes = lines_box(img_boxes, list_allcoords_textlines[:,ind_t].astype(np.uint32), + color_vec = 'magenta', thick_line = thick_l) + for ind_t in label_textlines['text_col2']: + img_boxes = lines_box(img_boxes, list_allcoords_textlines[:,ind_t].astype(np.uint32), + color_vec = 'cyan', thick_line = thick_l) + for ind_t in label_textlines['text_inheader']: + img_boxes = lines_box(img_boxes, list_allcoords_textlines[:,ind_t].astype(np.uint32), + color_vec = 'yellow', thick_line = thick_l) + for ind_t in label_textlines['notidentified']: + img_boxes = lines_box(img_boxes, list_allcoords_textlines[:,ind_t].astype(np.uint32), + color_vec = 'yellow', thick_line = thick_l) + return img_boxes + +def plot_margins(img, side_margins, topb_margins, gap_line = 2, thick_l = 6): + + img_marg = np.copy(img) + img_marg = lines_box(img_marg, np.array([0, side_margins[0]-gap_line, + img_marg.shape[0], side_margins[0]+gap_line]), + color_vec = 'green', thick_line = thick_l) + img_marg = lines_box(img_marg, np.array([0, side_margins[1]-gap_line, + img_marg.shape[0], side_margins[1]+gap_line]), + 
color_vec = 'green', thick_line = thick_l) + + img_marg = lines_box(img_marg, np.array([topb_margins[0]-gap_line, 0, + topb_margins[0]+gap_line, img_marg.shape[1]]), + color_vec = 'green', thick_line = thick_l) + img_marg = lines_box(img_marg, np.array([topb_margins[1]-gap_line, 0, + topb_margins[1]+gap_line, img_marg.shape[1]]), + color_vec = 'green', thick_line = thick_l) + + return img_marg + +def plot_orderedtextl(img,set_of_blocks, n_colors = 255): + # set_of_blocks is a dictionary, where each key is a block, and then we have + # inside an array of dim 5xNumTextlines, where the textlines are ordered. + # The 5th element of this array indicates if the textline is instead a + # horizontal line that we don't have to plot + img_textl = np.copy(img) + color_def_long = plt.cm.hsv(range(255))[(np.linspace(0,254,n_colors)).astype(np.uint32),:3] + count_textl = 0 + for ind_d in range(len(set_of_blocks)): + all_el = set_of_blocks[ind_d] + for ind_c in range(all_el.shape[1]): + if all_el[4,ind_c] > -1: + img_textl = lines_box(img_textl, all_el[:4,ind_c].astype(np.uint32), + color_vec = (255 * color_def_long[count_textl,:]).astype(int), thick_line = 6) + count_textl += 1 + return img_textl + +def plot_horzvertlines(img, coord_horz, coord_vert_def): + # Horizontal green in red and vertical in green + img_lines = np.copy(img) + + for ind_h in range(coord_horz.shape[1]): + img_lines = lines_box(img_lines, coord_horz[:,ind_h].astype(np.uint32), + color_vec = 'red', thick_line = 6) + + img_lines = lines_box(img_lines, coord_vert_def.astype(np.uint32), + color_vec = 'green', thick_line = 6) + + return img_lines + +def plot_correctedXML(img, XML_enrich, bbox_page): + # Essentially plotting the corrected textboxes and the lines from the + # final xml + img_xml = np.copy(img) + for ind_el in range(0, len(XML_enrich[0])): + if XML_enrich[0][ind_el].tag == 'textbox': + if 'bbox' in XML_enrich[0][ind_el].attrib: + coord_textbox = np.array(XML_enrich[0][ind_el].attrib['bbox'].split(',')).astype(np.float64) + if XML_enrich[0][ind_el].attrib['type_textbox'] == 'line': + img_xml = highlight_text(img_xml, coord_textbox, bbox_page, color_vec = 'blue', alpha = True, filled = False, thick_line = 6) + if XML_enrich[0][ind_el].attrib['type_textbox'] == 'text': + img_xml = highlight_text(img_xml, coord_textbox, bbox_page, color_vec = 'red', alpha = True, filled = False, thick_line = 6) + return img_xml + +def plot_save_parallel(folder_pickles): + # Using files from pickle. Provides directly the folder with the pickles to + # convert. 
This was implemented for the sake of running things in parallel, + # which was not allowing to save the figs directly + + all_files = os.listdir(folder_pickles) + for filename in all_files: + if re.match("20[\w]*.pkl",filename): + full_filename = folder_pickles + filename + with open(full_filename,'rb') as f: # Python 3: open(..., 'rb') + all_images = pickle.load(f) + fig, axes = plt.subplots(1, len(all_images), figsize=(7.5 * len(all_images), 10)) + ax = axes.ravel() + for ind_image, im_plot in enumerate(all_images): + ax[ind_image].axis('off') + ax[ind_image].imshow(im_plot) + + format_fig = 'png' + name_fig = full_filename[:-3] + format_fig + fig.savefig(name_fig, format = format_fig, dpi = 200) + plt.close(fig) \ No newline at end of file diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py new file mode 100644 index 00000000..8b4349f0 --- /dev/null +++ b/src/python/preproc_docs.py @@ -0,0 +1,853 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Sep 28 13:39:10 2018 + +@author: luissalamanca +""" + +# File for all the functions used for preprocessing. + +import numpy as np +import os +from plot_tools import adapt_coordtoimg, transform_coord_toorig +from pdf2image import convert_from_path, convert_from_bytes + +import copy + +from skimage.transform import (hough_line, hough_line_peaks, + probabilistic_hough_line) +from plot_tools import (lines_box, transform_coord, + highlight_text, plot_labelled_boxes, plot_margins, + plot_orderedtextl, plot_horzvertlines, plot_correctedXML) + +from skimage.feature import canny +from skimage.measure import label, regionprops +from skimage.morphology import remove_small_objects, dilation + +from sklearn.neighbors import BallTree + +from utils_proc import call_with_out, pdf2png, pdf2xml + +import pickle +import matplotlib.pyplot as plt + +import xml.etree.ElementTree as ET + +from math import pi, e +from functools import reduce + +from sys import getsizeof + +import time + +import tables + +HEIGHT_CHAR = 12 +WIDTH_CHAR = 6 + +def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, + flag_2col, flag_central = 1): + # Using the coordinates of the boxes, we put the rest to 0, and then estimate + # the central line + # Here, since we use the image, we have to rely again on a ref00 in topleft, and + # the corners in topleftbottomright + # We also look for horizontal lines + # We assume that we will only have one vertical line, and then many horizontal + # lines, either spanning the whole image, or at both sides of the central line + + coord, rescale_factor = adapt_coordtoimg(img, coord, dim_bbox_page) + img_aux = np.abs(255 - img[:,:,0]) + img_aux[img_aux < 20] = 0 + img_aux[img_aux >= 20] = 255 + img_aux_in = np.copy(img_aux) + + + width_resc = WIDTH_CHAR * rescale_factor[0,1] + height_resc = HEIGHT_CHAR * rescale_factor[0,1] + gap_central = int(4 * width_resc) + top_bbox_red = 0 #int(height_resc/2) + + for ind in range(coord.shape[1]): + img_aux[(coord[0,ind] + top_bbox_red):coord[2,ind],coord[1,ind]:coord[3,ind]] = 0 + + # Also remove possible mark and artefacts in the edges + img_aux[:,:int(img_aux.shape[1]/20)] = 0 + img_aux[:int(img_aux.shape[0]/20),:] = 0 + img_aux[int(19 * img_aux.shape[0]/20):,:] = 0 + img_aux[:,int(19 * img_aux.shape[1]/20):] = 0 + + img_prev = np.copy(img_aux) + + img_aux_rem = remove_small_objects(label(img_aux), 2 * width_resc) + #img_aux = dilation(img_aux_rem, selem = np.ones((11,11))) + img_aux = dilation(img_aux_rem, selem = np.ones((5,5))) + max_val = np.max(img_aux) + if max_val > 
0: + img_aux_norm = (255 * img_aux/max_val).astype(np.uint8) + img_aux[img_aux_norm < 1] = 0 + img_aux[img_aux_norm >= 1] = 255 + else: + img_aux[:] = 0 + #print(np.unique(img_aux)) + + # Remove big objects, like the shields and other logos + #img_label = label(img_aux) + + edges = canny(img_aux, 2, 1, 25) + #img_cent = np.copy(img_aux) + + if flag_2col: + if flag_central: + img_cent = np.copy(img_prev) + img_cent[:,0:int(2*img_aux.shape[1]/5)] = 0 + img_cent[:,int(3*img_aux.shape[1]/5):img_aux.shape[1]] = 0 + edges_cent = canny(img_cent, 2, 1, 25) + theta = np.linspace(-pi/8, pi/8,num = 90) + #theta = np.linspace(-pi/16, pi/16,num = 90) + #lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = 2 * width_resc, + # line_gap = width_resc) + lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = int(2 * width_resc), + line_gap = int(width_resc)) + else: + sum_img_aux_in = np.sum(img_aux_in, axis = 0) + sum_img_aux_in = sum_img_aux_in[int(2*img_aux.shape[1]/5):int(3*img_aux.shape[1]/5)] + + #plt.plot(sum_img_aux_in) + #sum_img_aux_in[sum_img_aux_in < np.max(sum_img_aux_in)/10] = 0 + # We need to substract the baseline value, in order to account for + # central headers and stuff like that + sum_img_aux_in = sum_img_aux_in - np.min(sum_img_aux_in) + #not_end_vect = 1 + #while not_end_vect: + ind_min_start = np.argwhere((sum_img_aux_in) < np.mean(sum_img_aux_in)/10) + ind_min_end = int(2*img_aux.shape[1]/5) + np.max(ind_min_start) + ind_min_start = int(2*img_aux.shape[1]/5) + np.min(ind_min_start) + ind_central = int((ind_min_start + ind_min_end)/2) + coord_vert_def = np.array([1, ind_central - int(width_resc/2), + img_aux_in.shape[0], ind_central + int(width_resc/2)]) + #print(lines_vert,img_aux.shape) + + theta = np.linspace(-5*pi/8, -3* pi/8,num = 90) + #theta = np.linspace(-9*pi/16, -7*pi/16,num = 90) + #lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = 2 * width_resc, + # line_gap = width_resc) + lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = int(2 * width_resc), + line_gap = int(width_resc)) + + # These lines are given in a standard xy coordinate, with the corner in the + # bottom left + lines_horz = np.transpose(np.asarray(lines_horz).reshape((len(lines_horz),4))) + + + lines_horz = np.concatenate((np.minimum(lines_horz[1,:],lines_horz[3,:]).reshape((1,lines_horz.shape[1])), + np.minimum(lines_horz[0,:],lines_horz[2,:]).reshape((1,lines_horz.shape[1])), + np.maximum(lines_horz[1,:],lines_horz[3,:]).reshape((1,lines_horz.shape[1])), + np.maximum(lines_horz[0,:],lines_horz[2,:]).reshape((1,lines_horz.shape[1])))).astype(np.int32) + if flag_central: + lines_vert = np.transpose(np.asarray(lines_vert).reshape((len(lines_vert),4))) + lines_vert = np.concatenate((np.minimum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])), + np.minimum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])), + np.maximum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])), + np.maximum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])))).astype(np.int32) + + + #lines_horz = transform_coord(lines_horz, dim_page = img_aux.shape, invert_xy = True) + #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True) + + # First clean the vertical from unexpected outliers + if flag_central: + sum_rows = np.sum(img_cent, axis = 0)/255 + ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)]) + 
ind_valid = np.intersect1d(np.argwhere([(ind_central - gap_central) < aux_l1 < (ind_central + gap_central) for aux_l1 in lines_vert[1,:]]), + np.argwhere([(ind_central - gap_central) < aux_l2 < (ind_central + gap_central) for aux_l2 in lines_vert[3,:]])) + if len(ind_valid): + lines_vert = lines_vert[:,ind_valid] + coord_vert_def = np.array([np.min(lines_vert[0,:]), np.min(lines_vert[1,:]), + np.max(lines_vert[2,:]), np.max(lines_vert[3,:])]).astype(np.int32) + else: + coord_vert_def = np.array([0, img_aux.shape[1]/2 - width_resc, height_resc, img_aux.shape[1]/2 + width_resc]) + + #ind_central = np.mean(coord_vert_def[[1,3]]) + + # And now, just iterate over the horizontal lines, merging them if required. + return clean_horz_vert_lines(lines_horz, coord_vert_def, width_resc, height_resc, + ind_central, gap_central, img_aux.shape) + + +def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_resc, + ind_central, gap_central, dim_page): + # We just iterate over all the horizontal lines, merging them if required + coord_horz = np.array([]).reshape((4,0)).astype(np.int32) + min_length_line = 2 * width_resc + + while coord_horz_pre.size > 3: + if coord_horz_pre.shape[1] == 1: + coord_horz = np.concatenate((coord_horz, coord_horz_pre[:,0].reshape((4,1))), axis = 1) + coord_horz_pre = np.array([]) + else: + coord_horz_curr = coord_horz_pre[:,0] + #print(coord_horz_curr) + coord_horz_check = coord_horz_pre[:,1:] + flag_stay = 1 + while flag_stay: + # Boxes to the right + ind_val1 = np.intersect1d(np.argwhere((abs(coord_horz_check[1,:] - coord_horz_curr[3]) < (width_resc * 10))), + np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc)))) + # Boxes to the left + ind_val2 = np.intersect1d(np.argwhere((abs(coord_horz_check[3,:] - coord_horz_curr[1]) < (width_resc * 10))), + np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc)))) + + ind_val = np.unique(np.concatenate((ind_val1,ind_val2))) + if len(ind_val) > 0: + for i_b in range(len(ind_val)): + coord_horz_curr = np.array([np.min((coord_horz_curr[0],coord_horz_check[0,ind_val[i_b]])), + np.min((coord_horz_curr[1],coord_horz_check[1,ind_val[i_b]])), + np.max((coord_horz_curr[2],coord_horz_check[2,ind_val[i_b]])), + np.max((coord_horz_curr[3],coord_horz_check[3,ind_val[i_b]]))]) + coord_horz_check = coord_horz_check[:,np.setdiff1d(np.arange(coord_horz_check.shape[1]), + ind_val)] + #coord_horz_check = np.delete(coord_horz_check, ind_val, 1) + if coord_horz_check.shape[1] == 0: + flag_stay = 0 + coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1) + coord_horz_pre = np.array([]) + else: + flag_stay = 0 + coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1) + coord_horz_pre = coord_horz_check[:,:] + + # Remove overlapping boxes + coord_horz_def = np.array([]).reshape((4,0)) + while coord_horz.size > 3: + coord_horz_curr = coord_horz[:,0] + ind_overlap = reduce(np.intersect1d, (np.argwhere((coord_horz_curr[0] - width_resc/2) < coord_horz[0,:]), + np.argwhere((coord_horz_curr[1] - width_resc/2) < coord_horz[1,:]), + np.argwhere((width_resc/2 + coord_horz_curr[2]) > coord_horz[2,:]), + np.argwhere((width_resc/2 + coord_horz_curr[3]) > coord_horz[3,:]))) + ind_overlap = np.setdiff1d(ind_overlap,0) + + coord_horz_def = np.concatenate((coord_horz_def, coord_horz_curr.reshape((4,1))), axis = 1) + coord_horz = coord_horz[:,np.setdiff1d(np.arange(1,coord_horz.shape[1]),ind_overlap)] + #coord_horz = np.delete(coord_horz, ind_overlap, 1) 
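+        # Descriptive note (added comment, not in the original commit): each pass of this loop keeps the first remaining horizontal line in coord_horz_def and drops any other detected lines whose boxes lie within it, with a half character-width tolerance.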
+ + if coord_horz.size == 4: + coord_horz_def = np.concatenate((coord_horz_def, coord_horz.reshape((4,1))), axis = 1) + coord_horz = np.array([0]) + + ind_val_long = np.argwhere((coord_horz_def[3,:] - coord_horz_def[1,:]) > (3 * (coord_horz_def[2,:] - coord_horz_def[0,:]))) + coord_horz_def = coord_horz_def[:,ind_val_long].reshape((4,ind_val_long.shape[0])) + + # Simply, remove too short lines that are likely artifacts + length_lines = coord_horz_def[3,:] - coord_horz_def[1,:] + ind_val = np.argwhere(length_lines > min_length_line) + if len(ind_val): + coord_horz_def = coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0])) + else: + coord_horz_def = np.array([]).reshape((4,0)) + + + # To identify the topline + ''' + ind_topline = identify_topline(coord_horz_def, width_resc, dim_page) + if str_page == 'firsts': + # We correct the top of the vertical line in case it is cutting some of the horizontal lines + ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)), + np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)), + np.argwhere(coord_horz_def[0,:] > coord_vert_def[0]))) + ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline) + + coord_vert_def = np.array([np.max(np.concatenate((np.array([coord_vert_def[0]]),coord_horz_def[2,ind_val_horz]))),coord_vert_def[1], + coord_vert_def[2],coord_vert_def[3]]) + elif str_page == 'lasts': + # We correct the bottom of the vertical line in case it is cutting some of the horizontal lines + ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)), + np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)), + np.argwhere(coord_horz_def[2,:] < coord_vert_def[2]))) + ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline) + + coord_vert_def = np.array([coord_vert_def[0],coord_vert_def[1], + np.min(np.concatenate((np.array([coord_vert_def[2]]),coord_horz_def[0,ind_val_horz]))),coord_vert_def[3]]) + ''' + + coord_vert_def[1] = np.max((coord_vert_def[1], int(ind_central - width_resc))) + coord_vert_def[3] = np.min((coord_vert_def[3], int(ind_central + width_resc))) + + # Finally, remove short central lines, likely artefacts of the calculation + # of the central vertical line + length_lines = coord_horz_def[3,:] - coord_horz_def[1,:] + ind_wrong = reduce(np.intersect1d, (np.argwhere(length_lines < 2* min_length_line), + np.argwhere(coord_horz_def[1,:] < coord_vert_def[3]), + np.argwhere(coord_horz_def[3,:] > coord_vert_def[1]))) + ind_val = np.setdiff1d(np.arange(coord_horz_def.shape[1]),ind_wrong) + if len(ind_val): + coord_horz_def = coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0])) + else: + coord_horz_def = np.array([]).reshape((4,0)) + + return coord_vert_def, coord_horz_def + +def identify_topline(coord_horz, width_resc, dim_page): + # Two rules for identifying the top line + ind_topline = reduce(np.intersect1d, (np.argwhere(coord_horz[2,:] < dim_page[0]/8), + np.argwhere((coord_horz[3,:] - coord_horz[1,:]) > width_resc * 60))) + + return ind_topline + +def lateral_margins(img, dim_bbox_page, coord_vert, coord_horz): + + coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page) + width_resc = WIDTH_CHAR * rescale_factor[0,1] + gap_central = int(3 * width_resc) + thres_margin = 0.1 + + img_aux = np.abs(255 - img[:,:,0]) + for ind in range(coord_horz.shape[1]): + img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0 + + img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0 + 
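lateral_margins blanks out the detected separator lines and then, in the code that follows, locates the left and right text margins from a column projection profile. A standalone sketch of that profile step, simplified to a single global mean rather than separate means for the left and right half of the page; the array and threshold are illustrative.

import numpy as np

def margins_from_profile(ink_img, thres_frac=0.1):
    # Per-column ink, normalised to a 0..1000 range as in the function
    # below, then the first/last column exceeding a fraction of the mean.
    col_ink = ink_img.sum(axis=0).astype(np.float64)
    col_ink = 1000 * col_ink / col_ink.max()
    active = np.argwhere(col_ink > thres_frac * col_ink.mean()).ravel()
    return int(active.min()), int(active.max())

# Fake binarised page with text between columns 40 and 159.
page = np.zeros((200, 200))
page[20:180, 40:160] = 1
print(margins_from_profile(page))   # (40, 159)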
central_line = (coord_vert[1] + coord_vert[3])/2 + + # Also remove possible mark and artefacts in the edges + img_aux[:,:gap_central] = 0 + img_aux[:int(gap_central/2),:] = 0 + img_aux[(img_aux.shape[1] - gap_central):,:] = 0 + img_aux[:,(img_aux.shape[1] - int(gap_central/2)):] = 0 + + sum_imarray_aux = np.sum(img_aux, axis = 0) + sum_imarray_aux = 1000*sum_imarray_aux.astype(np.float64)/np.max(sum_imarray_aux) + mean_val_rows_left = np.mean(sum_imarray_aux[:int(central_line - gap_central)]) + mean_val_rows_right = np.mean(sum_imarray_aux[int(central_line + gap_central):]) + + left_margin = np.min(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_left)) + right_margin = np.max(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_right)) + + return left_margin, right_margin, left_margin/rescale_factor[0,1], right_margin/rescale_factor[0,1] + +def bottomtop_margins(img, dim_bbox_page, coord_vert, coord_horz): + + val_thres = 300 # In this case we don't use the mean of sum_cols because we have + + coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page) + img_aux = np.abs(255 - img[:,:,0]) + + height_resc = HEIGHT_CHAR * rescale_factor[0,1] + width_resc = WIDTH_CHAR * rescale_factor[0,1] + gap_central = int(3 * width_resc) + + for ind in range(coord_horz.shape[1]): + img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0 + + img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0 + + sum_cols = np.sum(img_aux, axis = 1)/255 + sum_cols = 1000 * sum_cols/np.max(sum_cols) + + # Now, limit by using the horizontal lines + ind_topline = identify_topline(coord_horz, width_resc, img_aux.shape) + + if len(ind_topline) > 0: + ind_min_textbox = np.max(coord_horz[2,ind_topline]) + sum_cols[:ind_min_textbox] = 0 + + #plt.figure() + #plt.plot(sum_cols) + ind_limits = np.array([np.min(np.argwhere(sum_cols > val_thres)), + np.max(np.argwhere(sum_cols > val_thres))]) + + return ind_limits + +def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_font_sizes): + + # In xml_page the levels are: xml_page[i][j][k], i for blocks, j for textlines + # and k for characters + + coord, rescale_factor = adapt_coordtoimg(img, bbox_page, bbox_page) + list_coords_blocks = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_page[:-2]]).astype(np.float64)) + list_coords_blocks, rescale_factor = adapt_coordtoimg(img, list_coords_blocks, bbox_page) + + font_main_block = info_font_sizes[0, np.argmax(info_font_sizes[1,:])] + thres_font = font_main_block/5 # To compensate for error in the fontsize between columns + width_resc = WIDTH_CHAR * rescale_factor[0,1] + height_resc = HEIGHT_CHAR * rescale_factor[0,1] + gap_central = int(2 * width_resc) + indentation = int(4 * width_resc) + + ind_central = (coord_vert_def[3] + coord_vert_def[1])/2 + + # First pass just to discover main blocks + list_col1 = list() + list_col2 = list() + list_pagen = list() + list_textinheader = list() + all_mean_heights = np.array([]).reshape((1,0)) + list_allcoords_textlines = np.array([]).reshape((4,0)) + relative_ref_textline = np.array([], dtype = np.uint32).reshape((3,0)) + + count_text = 0 + + for ind_block in range(len(xml_page)-2): + xml_block = xml_page[ind_block] + list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:] + if 'bbox' in o.attrib]).astype(np.float64)) + #list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]]).astype(np.float64)) + if 
len(list_coords_textline)>3: + list_coords_textline + list_coords_textline, rescale_factor = adapt_coordtoimg(img, list_coords_textline, bbox_page) + list_allcoords_textlines = np.concatenate((list_allcoords_textlines, list_coords_textline), axis = 1) + relative_ref_textline_aux = np.zeros((3,list_coords_textline.shape[1])) + + relative_ref_textline_aux[0,:] = count_text + np.arange(list_coords_textline.shape[1]) + relative_ref_textline_aux[1,:] = ind_block + relative_ref_textline_aux[2,:] = np.arange(list_coords_textline.shape[1]) + relative_ref_textline = np.concatenate((relative_ref_textline,relative_ref_textline_aux.astype(np.uint32)), axis = 1) + + for ind_textl in range(list_coords_textline.shape[1]): + all_heights = np.array([]) + xml_textline = xml_block[ind_textl] + if xml_textline.tag == 'textline': + bbox_textline = list_coords_textline[:,ind_textl] + for xml_text in xml_textline[:]: + if 'size' in xml_text.attrib: + all_heights = np.append(all_heights, float(xml_text.attrib['size'])) + #fontsize = fontsize_fromtextline(img[bbox_textline[0]:bbox_textline[2], + # bbox_textline[1]:bbox_textline[3],0]) + + fontsize = np.average(all_heights) + all_mean_heights = np.append(all_mean_heights, fontsize) + + # Normal font + #if ((font_main_block - thres_font) < mean_height < (font_main_block + thres_font)): + if ((font_main_block - thres_font) < fontsize < (font_main_block + thres_font)): + # Left side of the central line + # Centered + if ((bbox_textline[1] < ind_central) and (bbox_textline[3] > ind_central)): + # Short, just a few numbers + if len(xml_block[0]) < 12: + list_pagen.append(count_text) + else: + list_textinheader.append(count_text) + elif ((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))): + list_col1.append(count_text) + # Right side of the central line + elif ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central))): + list_col2.append(count_text) + count_text += 1 + + discovered_blocks = np.concatenate((np.array(list_col1),np.array(list_col2), + np.array(list_pagen),np.array(list_textinheader))) + blocks_left = np.setdiff1d(np.arange(list_allcoords_textlines.shape[1]),discovered_blocks) + + if len(list_col1): + bbox_col1 = np.array([np.min(list_allcoords_textlines[0,list_col1]), + np.min(list_allcoords_textlines[1,list_col1]), + np.max(list_allcoords_textlines[2,list_col1]), + np.max(list_allcoords_textlines[3,list_col1])]) + else: + bbox_col1 = np.array([0,0,10,10]) # Dummy value + + if len(list_col2): + bbox_col2 = np.array([np.min(list_allcoords_textlines[0,list_col2]), + np.min(list_allcoords_textlines[1,list_col2]), + np.max(list_allcoords_textlines[2,list_col2]), + np.max(list_allcoords_textlines[3,list_col2])]) + else: + bbox_col2 = np.array([0,0,10,10]) # Dummy value + + list_header = list() + list_header_singlecol = list() + list_footnote = list() + list_notidentified = list() + for ind_textline in blocks_left: + xml_textline = xml_page[relative_ref_textline[1,ind_textline]][relative_ref_textline[2,ind_textline]] + if xml_textline.tag == 'textline': + bbox_textline = list_allcoords_textlines[:,ind_textline] + # Small fontsize and below current bboxes of main blocks + if ((all_mean_heights[ind_textline] < (font_main_block - thres_font)) and + (bbox_textline[2] > bbox_col1[2]) and (bbox_textline[2] > bbox_col2[2])): + list_footnote.append(ind_textline) + # Large fontsizes + elif (all_mean_heights[ind_textline] > (font_main_block - thres_font)): + # Centered + if 
((bbox_textline[1] < ind_central) and (bbox_textline[3] > ind_central)): + list_header.append(ind_textline) + # To the left or right of the central line + elif (((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))) or + ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central)))): + list_header_singlecol.append(ind_textline) + # Standard fontsize + elif ((font_main_block - thres_font) < all_mean_heights[ind_textline] < (font_main_block + thres_font)): + # Contained into the bbox of left column + if (((bbox_col1[0] - height_resc) < bbox_textline[0]) and ((bbox_col1[1] - width_resc) < bbox_textline[1]) + and ((bbox_col1[2] + height_resc) > bbox_textline[2]) and ((bbox_col1[3] + width_resc) > bbox_textline[3])): + list_col1.append(ind_textline) + # Contained into the bbox of the right column + elif (((bbox_col2[0] - height_resc) < bbox_textline[0]) and ((bbox_col2[1] - width_resc) < bbox_textline[1]) + and ((bbox_col2[2] + height_resc) > bbox_textline[2]) and ((bbox_col2[3] + width_resc) > bbox_textline[3])): + list_col2.append(ind_textline) + else: + list_notidentified.append(ind_textline) + + label_textlines = dict() + label_textlines['text_col1'] = list_col1 + label_textlines['text_col2'] = list_col2 + label_textlines['footnote'] = list_footnote + label_textlines['pagen'] = list_pagen + label_textlines['text_inheader'] = list_textinheader + label_textlines['header'] = list_header + label_textlines['header_singlecol'] = list_header_singlecol + label_textlines['notidentified'] = list_notidentified + + vec_labels_textline = np.zeros(list_allcoords_textlines.shape[1]).astype(np.str) + vec_labels_textline[list_col1] = 'text_col1' + vec_labels_textline[list_col2] = 'text_col2' + vec_labels_textline[list_footnote] = 'footnote' + vec_labels_textline[list_pagen] = 'pagen' + vec_labels_textline[list_textinheader] = 'text_inheader' + vec_labels_textline[list_header] = 'header' + vec_labels_textline[list_header_singlecol] = 'header_singlecol' + vec_labels_textline[list_notidentified] = 'notidentified' + + # relative_ref_textline: three rows with the following, the aboslute reference + # for the textline, the number of the block, and the number of the textline inside + # that block + return label_textlines, list_allcoords_textlines, relative_ref_textline, all_mean_heights, vec_labels_textline + + +def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textlines, margins): + # Two steps, first ordering the textlines, grouping them in big blocks separated + # by horizontal lines. Then, inside these groups, we group them in textboxes, + # incorporating this to the XML + height_resc = HEIGHT_CHAR * rescale_factor[0,1] + widht_resc = WIDTH_CHAR * rescale_factor[0,1] + + gap_central = 3 * widht_resc + gap_row = height_resc/2 + + # This parameters is intended for removing artefacts such as small dots in the + # text. But we have to be careful, as we can remove valuable characters. 
+ # I first set a value of 3 * width_resc/4 + min_width_textl = 6 * widht_resc/4 + + central_line = (coord_vert_def[3] + coord_vert_def[1])/2 + array_coords_textl = np.concatenate((list_allcoords_textlines[:,:], + np.arange(list_allcoords_textlines.shape[1]).reshape((1,list_allcoords_textlines.shape[1])))) + + # Clean from to thin lines, thatn are just probably artefacts + all_widths = array_coords_textl[3,:] - array_coords_textl[1,:] + ind_valid = np.argwhere(all_widths > min_width_textl) + array_coords_textl = array_coords_textl[:,ind_valid].reshape((5,len(ind_valid))) + + ind_centralines = np.intersect1d(np.argwhere(coord_horz[1,:] < (central_line - gap_central)), + np.argwhere(coord_horz[3,:] > (central_line + gap_central))) + ind_sepfootnotes = np.intersect1d(np.argwhere(coord_horz[1,:] < (margins[0] + 2 * widht_resc)), + np.argwhere(coord_horz[3,:] < (central_line - gap_central))) + ind_centralines = np.union1d(ind_centralines,ind_sepfootnotes) + ind_collines = np.setdiff1d(np.arange(coord_horz.shape[1]),ind_centralines) + + array_coords_centrall = coord_horz[:,ind_centralines] + array_coords_coll = coord_horz[:,ind_collines] + array_coords_coll = np.concatenate((array_coords_coll, + -1 * np.ones(array_coords_coll.shape[1]).reshape((1,array_coords_coll.shape[1])))) + + not_visited = 1 + toprow = 0 + count_b = 0 + set_of_blocks = dict() + array_coords_centrall_ord = np.array([]).reshape((4,0)) + while not_visited: + + if array_coords_centrall.size > 3: + bottomrow = np.min(array_coords_centrall[0,:]) + ind_bottomrow = np.argmin(array_coords_centrall[0,:]) + array_coords_centrall_ord = np.concatenate((array_coords_centrall_ord, + array_coords_centrall[:,ind_bottomrow].reshape((4,1))), axis = 1) + array_coords_centrall = np.delete(array_coords_centrall,ind_bottomrow,1) + else: + bottomrow = 10000 + not_visited = 0 + ind_textl_proc = np.intersect1d(np.argwhere((array_coords_textl[2,:] - gap_row) >= toprow), + np.argwhere((array_coords_textl[2,:] - gap_row) < bottomrow)) + ind_lines_proc = np.intersect1d(np.argwhere(array_coords_coll[2,:] > toprow), + np.argwhere(array_coords_coll[0,:] < bottomrow)) + coord_cat = np.concatenate((array_coords_textl[:,ind_textl_proc].reshape(5,len(ind_textl_proc)), + array_coords_coll[:,ind_lines_proc].reshape(5,len(ind_lines_proc))), + axis = 1) + + if coord_cat.size > 0: + flag_col = 1 + ind_currcord = topmost_left_box(coord_cat, gap_row, max_col = central_line) + if ind_currcord == -1: + ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) + flag_col = 2 + + order_coords = np.array([]).reshape(5,0) + while coord_cat.size > 4: + order_coords = np.concatenate((order_coords,coord_cat[:,ind_currcord].reshape(5,1)), axis = 1) + curr_coord = coord_cat[:,ind_currcord] + coord_cat = np.delete(coord_cat,ind_currcord,1) + if coord_cat.size > 4: + if flag_col == 1: + ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, max_col = central_line) + + if ind_currcord == -1: + ind_currcord = next_textline_samecol(coord_cat, gap_row, max_col = central_line) + + if ind_currcord == -1 : + ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) + flag_col = 2 + + elif flag_col == 2: + ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, min_col = central_line) + + if ind_currcord == -1: + ind_currcord = next_textline_samecol(coord_cat, gap_row, min_col = central_line) + + if ind_currcord == -1 : + flag_col = 1 + ind_currcord = 0 + + else: + order_coords = np.array([]).reshape(5,0) + + toprow = 
np.copy(bottomrow) + set_of_blocks[count_b] = order_coords + count_b += 1 + + return set_of_blocks, array_coords_centrall_ord + +def topmost_left_box(coords, gap_row, min_col = 0, max_col = 10000): + # Returns the index of the box with topleft corner most to the top and the left + ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col), + np.argwhere(coords[1,:] > min_col)) + if len(ind_valid): + min_row = np.min(coords[2,ind_valid]) + min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (min_row - gap_row)), + np.argwhere(coords[2,ind_valid] < (min_row + gap_row))) + ind_valid_min = ind_valid[min_row] + curr_ind = ind_valid_min[np.argmin(coords[1,ind_valid_min])] + return curr_ind + else: + return -1 + +def next_textline_samerow(coords, gap_row, curr_coord, min_col = 0, max_col = 10000): + curr_row = curr_coord[2] + #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col), + # np.argwhere(coords[3,:] > min_col)) + ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)), + np.argwhere(coords[1,:] > (min_col - gap_row))) + if len(ind_valid): + min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (curr_row - gap_row)), + np.argwhere(coords[2,ind_valid] < (curr_row + gap_row))) + if len(min_row): + ind_valid_min = ind_valid[min_row] + ind_next_textl = ind_valid_min[np.argmin(coords[1,ind_valid_min])] + return ind_next_textl + else: + return -1 + else: + return -1 + +def next_textline_samecol(coords, gap_row, min_col = 0, max_col = 10000): + #print(coords, max_col, min_col) + #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col), + # np.argwhere(coords[3,:] > min_col)) + ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)), + np.argwhere(coords[1,:] > (min_col - gap_row))) + if len(ind_valid): + min_row = np.min(coords[2,ind_valid]) + min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (min_row - gap_row)), + np.argwhere(coords[2,ind_valid] < (min_row + gap_row))) + ind_valid_min = ind_valid[min_row] + ind_next_textl = ind_valid_min[np.argmin(coords[1,ind_valid_min])] + return ind_next_textl + else: + return -1 + + +def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescale_factor, + centrall_ord, ind_page, dim_img): + + height_resc = HEIGHT_CHAR * rescale_factor[0,1] + widht_resc = WIDTH_CHAR * rescale_factor[0,1] + + max_inrow_sep = 4 * widht_resc + max_incol_sep = 1 * height_resc + gap_row = height_resc/2 + similarity_fonts = 0.95 + indentation = 2 * widht_resc + + centrall_ord_trans = transform_coord_toorig(centrall_ord, dim_page = dim_img, invert_xy = True, + rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft', + refCorners = 'topleftbottomright') + + # Start creating the xml + xml_e = [] + xml_e = ET.Element('pages') + page_el = ET.Element('page') + page_el.attrib['id'] = str(ind_page) + page_el.attrib['bbox'] = xml_t[0].attrib['bbox'] + page_el.attrib['rotate'] = '0' + xml_e.append(page_el) + + val_type_col1 = {'text_col1', 'notidentified', 'header_singlecol', 'text_inheader'} + val_type_col2 = {'text_col2', 'notidentified', 'header_singlecol', 'text_inheader'} + + + count_b = 0 + text_b = ET.SubElement(page_el, 'textbox') + text_b.attrib['id'] = str(count_b) + text_b.attrib['block'] = '0' + for ind_b in range(len(set_of_blocks)): + all_el = set_of_blocks[ind_b].astype(np.int64) + all_bbox = np.array([]).reshape((4,0)) + for ind_c in range(all_el.shape[1]): + curr_el = all_el[:,ind_c] + flag_copy_textb = 1 + # If it is a textline with text + if curr_el[4] > -1: 
+ all_bbox = np.concatenate((all_bbox, curr_el[:4].reshape((4,1))), axis = 1) + ref_curr_el = ref_textl[1:,int(curr_el[4])] + text_l = xml_t[0][ref_curr_el[0]][ref_curr_el[1]] + type_textl = labels_textl[int(curr_el[4])] + text_l.attrib['type'] = type_textl + text_b.append(text_l) + type_textbox = 'text' + + # To check if it satisfies the conditions for being a new textbox + if ind_c < (all_el.shape[1] - 1): + next_el = all_el[:,ind_c + 1] + if next_el[4] > -1: + if (((type_textl in val_type_col1) and (labels_textl[int(next_el[4])] in val_type_col1)) + or ((type_textl in val_type_col2) and (labels_textl[int(next_el[4])] in val_type_col2)) + or (type_textl == labels_textl[int(next_el[4])])): + # Object to the right or beneath + if ((0 < (next_el[1] - curr_el[3]) < max_inrow_sep) and (abs(curr_el[2] - next_el[2]) < gap_row)): + flag_copy_textb = 0 + elif ((0 < (next_el[2] - curr_el[2]) < max_incol_sep) and (abs(next_el[2] - curr_el[2]) > gap_row)): + # Accounting for footnotes or other stuff + curr_fontsize = curr_el[3] - curr_el[1] + next_fontsize = next_el[3] - next_el[1] + if ((curr_fontsize - next_fontsize * similarity_fonts) < curr_fontsize < + (curr_fontsize + next_fontsize * similarity_fonts)): + # Finally, account for indentation + if ((np.min(all_bbox[1,:]) + indentation) > next_el[1]): + flag_copy_textb = 0 + + # Attributes and stuff in case we need to store as textbox + if flag_copy_textb: + bbox_text_b = np.array([np.min(all_bbox[0,:]),np.min(all_bbox[1,:]), + np.max(all_bbox[2,:]),np.max(all_bbox[3,:])]) + bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, + rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft', + refCorners = 'topleftbottomright') + all_bbox = np.array([]).reshape((4,0)) + # Instead, if we have a line + else: + bbox_text_b = curr_el[:4] + text_l = ET.SubElement(text_b, 'textline') + text_l.attrib['type'] = 'col_lines' + bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, + rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft', + refCorners = 'topleftbottomright') + text_l.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2] + type_textbox = 'line' + + # Creating the new textbox + if flag_copy_textb: + text_b.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2] + text_b.attrib['type_textbox'] = type_textbox + count_b += 1 + if (ind_b == (len(set_of_blocks) - 1)) and (ind_c == (all_el.shape[1] - 1)): + pass + else: + text_b = ET.SubElement(page_el, 'textbox') + text_b.attrib['id'] = str(count_b) + text_b.attrib['block'] = str(ind_b) + all_bbox = np.array([]).reshape((4,0)) + + if (ind_b < (len(set_of_blocks) - 1)): + text_l = ET.SubElement(text_b, 'textline') + text_l.attrib['type'] = 'central_lines' + bbox_text_bcent = centrall_ord_trans[:,ind_b].reshape((1,4)) + text_l.attrib['bbox'] = np.array2string(bbox_text_bcent[:], precision = 3, separator = ',')[2:-2] + text_b.attrib['bbox'] = np.array2string(bbox_text_bcent[:], precision = 3, separator = ',')[2:-2] + text_b.attrib['type_textbox'] = 'line' + count_b += 1 + text_b = ET.SubElement(page_el, 'textbox') + text_b.attrib['id'] = str(count_b) + text_b.attrib['block'] = str(ind_b) + all_bbox = np.array([]).reshape((4,0)) + + + # Just add the two final elements from the original xml + page_el.append(xml_t[0][-2]) # Figure + page_el.append(xml_t[0][-2]) # Layout + + 
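For reference, the element hierarchy this function emits is pages > page > textbox > textline, with bbox strings back in the original PDF coordinate system and the added type / type_textbox attributes. A minimal sketch of that structure with made-up coordinates and text:

import xml.etree.ElementTree as ET

pages = ET.Element('pages')
page = ET.SubElement(pages, 'page',
                     id='0', bbox='0.000,0.000,595.276,841.890', rotate='0')
box = ET.SubElement(page, 'textbox', id='0', block='0', type_textbox='text')
line = ET.SubElement(box, 'textline',
                     type='text_col1', bbox='56.7,650.2,280.3,661.4')
line.text = 'Beispieltext'
print(ET.tostring(pages, encoding='unicode'))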
return xml_e + + +def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_col2')): + + # helper function to clean text + # !!! so far only removing new lines and primitive dehyphenation + def clean_text(text): + # replace newline + text = text.replace('\n', ' ') + + # account for hyphenation (not completely correct...) + text = text.replace('- ', '') + + return text + + # initialize textbox count and empty dictionary + + XML_new = copy.deepcopy(XML_root) + + # for every page + + for ind_p, page in enumerate(XML_root): + #print(page.tag, page.attrib) + # for every textbox on that page + + for ind_t, textbox in enumerate(page): + if (textbox.tag == 'textbox'): + if 'type_textbox' in textbox.attrib.keys(): + if (textbox.attrib['type_textbox'] == 'text'): + + # initialize string + + #print(textbox.tag, textbox.attrib) + # for every textline in that textbox + for ind_tl, textline in enumerate(textbox): + prev_fontsize = 0 + prev_fonttype = 'Def' + complete_text = '' + flag_in = 0 + if textline.tag == 'textline': + #print(textline.tag, textline.attrib) + # for every text (actually just a letter) + + for ind_ch, text in enumerate(textline): + #print(ind_ch, text.text, len(textline), len(XML_new[ind_p][ind_t][ind_tl])) + # extend string + if 'font' in text.attrib.keys(): + if (text.attrib['font'] != prev_fonttype) or (text.attrib['size'] != str(prev_fontsize)): + if flag_in: + complete_text += '[/font]' + else: + flag_in = 1 + complete_text += '[font face="' + text.attrib['size'] + '" size="' + text.attrib['font'] + '"]' + prev_fontsize = text.attrib['size'] + prev_fonttype = text.attrib['font'] + complete_text = complete_text + text.text + child_new = XML_new[ind_p][ind_t][ind_tl][0] # Because we are removing elements + XML_new[ind_p][ind_t][ind_tl].remove(child_new) + # clean text + complete_text += '[/font]' + complete_text = clean_text(complete_text) + XML_new[ind_p][ind_t][ind_tl].text = complete_text + + + return XML_new + \ No newline at end of file diff --git a/src/python/run_correct_meta.py b/src/python/run_correct_meta.py new file mode 100644 index 00000000..6fa74294 --- /dev/null +++ b/src/python/run_correct_meta.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Nov 26 09:58:05 2018 + +@author: luissalamanca +""" + +# Code to run the extraction of the XML files from the original pdfs + +import os, sys + +#sys.path.append('../src/python/') + +import def_classes as defc +import utils_proc + +import numpy as np +import time + + +# The input parameters are simply the year and the folder of the database +input_file = sys.argv[1] +output_file = sys.argv[2] + +# Name tar out +name_tar_out = output_file.split('/')[-1].split('.tar.gz')[0] + +# +year_tocomp = input_file.split('/')[-2] +folder_database = input_file.split(year_tocomp)[0] + +# Start of computation +t1 = time.time() + +name_tar_file = input_file.split('/')[-1].split('.tar.gz')[0] +files_proc, _ = utils_proc.get_list(year_tocomp, folder_database, name_tar_file) + +list_proc = list() +for infile in files_proc: + + # 8 is always the length of the id code + infile_aux = year_tocomp + '/' + infile.split('/')[-1][:8] + '.pdf' + if infile_aux not in list_proc: + list_proc.append(infile_aux) + print(infile_aux) + d1 = defc.Document(infile_aux, folder_database) + try : + d1.meta_correct(name_outmeta = name_tar_out) + print('Meta corrected %s' % infile) + except: + print("Meta to correct %s prompted an error" % infile) + +print('Total time for correcting meta of year %d: %f' % 
(int(year_tocomp) ,(time.time() - t1))) diff --git a/src/python/run_correctxml.py b/src/python/run_correctxml.py new file mode 100644 index 00000000..87e74b7b --- /dev/null +++ b/src/python/run_correctxml.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Nov 26 09:58:05 2018 + +@author: luissalamanca +""" + +# Code to run the extraction of the XML files from the original pdfs + +import os, sys + +sys.path.append('../src/python/') + +import def_classes as defc +import utils_proc + +import numpy as np +import time + + +# The input parameters are simply the year and the folder of the database +input_file = sys.argv[1] +output_file = sys.argv[2] + +# Name tar out +name_tar_out = output_file.split('/')[-1].split('.tar.gz')[0] + +# +year_tocomp = input_file.split('/')[-2] +folder_database = input_file.split(year_tocomp)[0] + +# Start of computation +t1 = time.time() + +name_tar_file = input_file.split('/')[-1].split('.tar.gz')[0] +files_proc, _ = utils_proc.get_list(year_tocomp, folder_database, name_tar_file) + +list_proc = list() +for infile in files_proc: + + # 8 is always the length of the id code + infile_aux = year_tocomp + '/' + infile.split('/')[-1][:8] + '.pdf' + if infile_aux not in list_proc: + list_proc.append(infile_aux) + d1 = defc.Document(infile_aux, folder_database) + try : + d1.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, name_outxml = name_tar_file, + name_outcorrxml = name_tar_out) + #print('Corrected %s' % infile) + except: + print("File to correct %s prompted an error" % infile) + +print('Total time for correction of year %d: %f' % (int(year_tocomp) ,(time.time() - t1))) \ No newline at end of file diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py new file mode 100644 index 00000000..2a5821f5 --- /dev/null +++ b/src/python/run_extract_discussions.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import pickle +import re +import pandas as pd +from nltk.corpus import stopwords +import time + +import hf_extractdiscussions as hf + +# specify input values +years = [1891, 1995] +range_years = range(years[0], years[1] + 1) + +# paths +#path_start = '/home/lili/NLP_DemocraSci/nlp-democracy/' +path_data = '/data/complete_data/AB/' +path_output = '/data/output/' + +# open dictionary of last names from pickle file +with open('/home/lili/nlp-democracy/output/MPs/MPs_lastnames.pickle', 'rb') as f: + dict_lastnames = pickle.load(f) + +# open dictionary of overlaps +with open('/data/complete_data/Results_overlap/DictOverlap1891to1930.pkl', 'rb') as f: + dict_overlaps_1 = pickle.load(f) +with open('/data/complete_data/Results_overlap/DictOverlap1931to1995.pkl', 'rb') as f: + dict_overlaps_2 = pickle.load(f) +with open('/data/complete_data/Results_overlap/DictOverlap1991to1995.pkl', 'rb') as f: + dict_overlaps_3 = pickle.load(f) +dict_overlaps = {**dict_overlaps_1, **dict_overlaps_2, **dict_overlaps_3} +print(dict_overlaps.keys()) + +# get dictionary of discussions +# ----------------------------- + +start_time_discussions = time.time() + +# list of votation terms +list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt', + 'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)', + 'Votation', 'Vote', 'votation', #'(AdoptÃs)', 'adoptÃs', 'adoptÃe', 'rejetÃe', + "D'accord", 'voix'] + +# list of stopwords +list_stopwords = stopwords.words('german') +list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 
'herr', 'herrn', 'hr']) +list_stopwords.extend(stopwords.words('french')) +list_stopwords.extend(['ils', 'les', 'celle']) + +print('start to identify discussions of the years', years, '\n\n\n') + +# initialize empty dictionary for all documents +dict_docs = {} + +# for every year +for year in range_years: + start_time = time.time() + + str_year = str(year) + print(year, '\n', 30*'=') + # initialize empty dictionary for that year + dict_year = {} + # extract list of numbers for that year + list_numbers = next(os.walk(path_data + str_year))[1]#os.listdir(path_data + str_year) + list_numbers.sort() + # extract list of lastnames for that year and generate dataframe from it + lists_lastnames = dict_lastnames[int(year)] + df_lastnames = hf.get_df_from_lists_names(lists_lastnames) + # extract overlaps of that year + dict_overlaps_year = dict_overlaps[year] + # for each number, i.e. document + for number in list_numbers: + path_number = path_data + str_year + '/' + number + '/' + # if document is a discussion + if (hf.check_if_discussion(path_number + number + '.xml')) and (number not in ['20032463', '20032952', '20014332']): + print(number + '\n') + # get dictionary with text + dict_text = hf.get_text_onefile(path_number + number + '_datacorr.xml') + # exclude parts from previous and next document + if number in dict_overlaps_year: + dict_text = hf.exclude_overlaps(dict_text, dict_overlaps_year[number]) + # get all discussionstarts + dict_discussionstarts = hf.get_discussion_starts(dict_text, df_lastnames, list_stopwords, bln_print=True) + # get votation paragraphs + dict_votations = hf.get_votations(dict_text, list_votationterms) + # put all discussions together in dictionary + dict_discussions, list_keys = hf.get_discussions(dict_text, dict_discussionstarts, dict_votations) + # save that discussions dictionary to the yearly dictionary + dict_year[number] = dict_discussions + + #print('\n\n') + # save that yearly dictionary to the dictionary for all documents + dict_docs[year] = dict_year + # dump that discussions dictionary in the yearly folder + path_year = path_output + 'AB/' + str_year + '/' + os.makedirs(path_year, exist_ok=True) + with open(path_year + 'dict_discussions.pickle', 'wb') as f: + pickle.dump(dict_year, f) + + print("Time to extract discussions for year %s: %s minutes\n" % (year, (time.time() - start_time)/60)) + + +# dump dictionary of documents to a pickle file +year_start = str(list(dict_docs.keys())[0]) +year_end = str(list(dict_docs.keys())[-1]) +with open(path_output + 'dict_discussions_' + year_start + '-' + year_end + '.pickle', 'wb') as f: + pickle.dump(dict_docs, f) + +print("Time to extract all discussions: %s minutes\n" % ((time.time() - start_time_discussions)/60)) + + +# Language identification with Luis' method +# ----------------------------------------- + +print('start to identify languages of the years', years, '\n\n\n') +start_time_languages = time.time() + + +# initialize empty dictionaries +dict_languages = {} +dict_german = {} +dict_french = {} +dict_italian = {} + +# for every year +for year in range_years: + str_year = str(year) + start_time = time.time() + + # initialize empty dictionaries for that year + dict_year = {} + dict_year_german = {} + dict_year_french = {} + dict_year_italian = {} + print(year) + + # load pickle dump for that year + with open(path_output + 'AB/' + str_year + '/dict_discussions.pickle', 'rb') as f: + dict_disc_year = pickle.load(f) + + # for every document in that year + for number in dict_disc_year: #dict_docs[year]: + # 
initiaze empty dictionaries for that document + dict_number = {} + dict_number_german = {} + dict_number_french = {} + dict_number_italian = {} + print(number) + + # tokenize discussion + dict_tokenized = hf.tokenize_dictionary(dict_disc_year[number], hf.tokenizer) + + # identify language + dict_lang = hf.identify_lang(dict_tokenized) + #print(dict_lang) + + # assign language + for tupel, value in dict_lang.items(): + #print(tupel) + lang = hf.label_language(value) + dict_number[tupel] = lang + if lang == 'german': + dict_number_german[tupel] = dict_disc_year[number][tupel] + elif lang == 'french': + dict_number_french[tupel] = dict_disc_year[number][tupel] + elif lang == 'italian': + dict_number_italian[tupel] = dict_disc_year[number][tupel] + else: + pass + #print(lang, value, dict_docs[year][number][tupel]) + + # add to dictionaries of that year + dict_year[number] = dict_number + dict_year_german[number] = dict_number_german + dict_year_french[number] = dict_number_french + dict_year_italian[number] = dict_number_italian + + # add to overall dictionaries + dict_languages[year] = dict_year + dict_german[year] = dict_year_german + dict_french[year] = dict_year_french + dict_italian[year] = dict_year_italian + + print("Time to identify languages for discussions of year %s: %s minutes\n" % (year, (time.time() - start_time)/60)) + + # dump for that year + with open(path_output + 'AB/' + str_year + '/dict_languages.pickle', 'wb') as f: + pickle.dump(dict_year, f) + with open(path_output + 'AB/' + str_year + '/dict_discussions_german.pickle', 'wb') as f: + pickle.dump(dict_year_german, f) + with open(path_output + 'AB/' + str_year + '/dict_discussions_french.pickle', 'wb') as f: + pickle.dump(dict_year_french, f) + with open(path_output + 'AB/' + str_year + '/dict_discussions_italian.pickle', 'wb') as f: + pickle.dump(dict_year_italian, f) + +print("Time to identify languages for all discussions: %s minutes\n" % ((time.time() - start_time_languages)/60)) diff --git a/src/python/run_extract_origxml.py b/src/python/run_extract_origxml.py new file mode 100644 index 00000000..8af12f7e --- /dev/null +++ b/src/python/run_extract_origxml.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Nov 26 09:58:05 2018 + +@author: luissalamanca +""" + +# Code to run the extraction of the XML files from the original pdfs + +import os, sys + +#sys.path.append('../src/python/') + +import def_classes as defc +import utils_proc + +import numpy as np +import time + + +# The input parameters are simply the year and the folder of the database +input_file = sys.argv[1] +output_file = sys.argv[2] + +# Name tar out +name_tar_out = output_file.split('/')[-1].split('.tar.gz')[0] + +# +year_tocomp = input_file.split('/')[-2] +folder_database = input_file.split(year_tocomp)[0] + +# Start of computation +t1 = time.time() + +name_tar_file = input_file.split('/')[-1].split('.tar.gz')[0] +files_proc, _ = utils_proc.get_list(year_tocomp, folder_database, name_tar_file) + +list_proc = list() +for infile in files_proc: + + # 8 is always the length of the id code + infile_aux = year_tocomp + '/' + infile.split('/')[-1][:8] + '.pdf' + if infile_aux not in list_proc: + list_proc.append(infile_aux) + d1 = defc.Document(infile_aux, folder_database) + try : + d1.pdf2xml(name_outxml = name_tar_out) + print('Extracted %s' % infile) + except: + print("File %s prompted an error" % infile) + +print('Total time for year %d: %f' % (int(year_tocomp) ,(time.time() - t1))) \ No newline at end of file diff 
--git a/src/python/utils_annot.py b/src/python/utils_annot.py new file mode 100644 index 00000000..5ffd75d5 --- /dev/null +++ b/src/python/utils_annot.py @@ -0,0 +1,600 @@ +#!/usr/bin/env python3 + +import xml.etree.ElementTree as ET +from nltk.tokenize import RegexpTokenizer +from nltk.corpus import stopwords +from nltk.metrics import edit_distance +from pyxdameraulevenshtein import damerau_levenshtein_distance_ndarray, normalized_damerau_levenshtein_distance_ndarray +import numpy as np +import pandas as pd +import string +import re + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# functions from hf_extractdiscussions.property +# ============================================================================== + +# function to exclude overlapping textboxes between documents +# input: +# - dict_text: dictionary of texts of one document +# - dict_overlaps_year: dictionary with overlaps +# output: +# - dict_text: modified dict_text +def exclude_overlaps(dict_text, dict_overlaps): + # initialize to impossible values + first_entry = -1 + last_entry = 1000 + + # get index of textbox from first and last page + # the overlap dictionary only contains an entry, if an overlap was detected + for entry, array in dict_overlaps.items(): + if entry == 'first': + first_entry = int(array[0]) + if entry == 'last': + last_entry = int(array[0]) + + # get list of keys for first and last page + list_first_page = [key for key in dict_text if key.split(',')[1] == '0'] + last_page = max([int(key.split(',')[1]) for key in dict_text]) + list_last_page = [key for key in dict_text if key.split(',')[1] == str(last_page)] + + # modify dict_text on first page + for key in list_first_page: + if int(key.split(',')[2]) < first_entry: + dict_text[key] = '' + + # ... and on last page + for key in list_last_page: + if int(key.split(',')[2]) > last_entry: + dict_text[key] = '' + + return dict_text + + +# tokenizer +tokenizer_canton = RegexpTokenizer(r'\w+') # only leaves words +#tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+|\S+') +# last part \S+ is needed to get colon, \S stands for white space +tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+') + +# function to extract discussion starts +# !!! 
maybe we only need a list of discussion starts +# input: +# - dict_text: dictionary with text of one file +# - list_names: list of MPs +# - list_stopwords: list of german and french stopwords +# - bln_print: whether to print during execution, default False +# output: +# - dict_discussionstarts: dictionary with discussion starts +def get_discussion_starts(dict_text, df_names, list_stopwords, bln_print=False): + + # initialize empty dictionary + dict_discussionstarts = {} + + # add a few terms to list_stopwords that are easily mistaken as last names + list_stopwords.extend(['art', 'rath', 'alinea', 'stimmen', 'stimme', 'hans', 'walter', 'werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'gallen', 'stgallen', + 'kasse', 'fasse', 'sitten', 'herren', 'herr', 'alter']) + + # for every textbox + for key, text in dict_text.items(): + if ':' in text[:100]: + # extract the index of the colon in the text + colon_index_text = text.index(':') + + # look at first few terms of that textbox + text_start = re.sub(r'[\(\)]','',text[:colon_index_text]) + print('text_start', text_start) + list_oi = tokenizer.tokenize(text_start) + print('asdf', list_oi) + + # shorten to part before colon + list_oi2 = list_oi + + # remove stopwords + list_oi2 = [term for term in list_oi2 if term.lower() not in list_stopwords] + + # remove punctuation + list_oi2 = [''.join(c for c in s if c not in string.punctuation) for s in list_oi2] + list_oi2 = [s for s in list_oi2 if s] + + # remove lower case terms +# list_oi2 = [term for term in list_oi2 if not term.islower()] + + # remove numbers + list_oi2 = [term for term in list_oi2 if not term.isdigit()] + + # remove single characters + list_oi2 = [term for term in list_oi2 if len(term)>1] + + # initialize string for name and role + str_name = '' + str_role = '' + int_uniqueID = int(0) + str_canton = '' + + # for every term, reversed finds canton before it finds name + for term in reversed(list_oi2): + # if possible, find a name in a list + str_name, str_role, int_uniqueID, str_canton = find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=True) + print('name', str_name, 'role', str_role) + + # get rid of 'Präsident stimmt nicht Président ne vote pas' + if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name: + print('++++++++++ Präsident', list_oi2, list_oi) + if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi): + str_role = '' + + # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":' + if str_name: + print('++++++++++ Name', list_oi2, list_oi) + if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1: + str_name = '' + + # if a name has been found + if str_name or str_role: + # add text to dictionary + dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text], + text[colon_index_text+1:]) + if bln_print: + print('found a name:', list_oi2, str_name, str_role, '\n') + + return dict_discussionstarts + + + +# small function to get first item of tupels in a list +def get_first_item(list_tupels): + list_first_item = [tupel[0] for tupel in list_tupels] + return list_first_item + +# small function to get last two items of tupels in a list +def get_last_item(list_tupels): + list_last_item = [tupel[-2:] for tupel in list_tupels] + return list_last_item + + + +# function to find names +# input: +# - term: term that might be name +# - str_name: string to which name should be attached +# - str_role: 
string to which role should be attached +# - int_uniqueID: integer for uniqueID +# !!! (if there are several possibilities, this becomes a tuple) +# - list_tupels: list of tupels containing all types of names +def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=False): + + def get_string(term, str_name, str_role, int_uniqueID, str_canton): + name_type = '' + # if it is one of the simple names + if term in list(df_names['name_short'].loc[df_names['type']=='simple']): + str_name = add_to_string(str_name, term) + name_type = 'simple' + # if it is a role + elif term in list_roles: + str_role = add_to_string(str_role, term) + # if it is a double name + elif term in list(df_names['name_short'].loc[df_names['type']=='double']): + print(20*'\n', 'DOUBLE NAME') + # get correct name + correct_name = df_names.loc[(df_names['type']=='double') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')] + if bln_print: + print('double name', correct_name) + str_name = add_to_string(str_name, correct_name) + name_type = 'double' + # if it is a composite name + elif term in list(df_names['name_short'].loc[df_names['type']=='comp']): + # get correct name + correct_name = df_names.loc[(df_names['type']=='comp') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')] + if bln_print: + print('composite name', correct_name) + str_name = add_to_string(str_name, correct_name) + name_type = 'comp' + # if it contains a canton + # !!! also pass list_oi to look for canton + # !!! how to handle for people mentioned in text??? + elif term in list(df_names['name_short'].loc[df_names['type']=='canton']): + if bln_print: + print('contains a canton', term) +# canton_missing = False +# df_temp = df_names.loc[df_names['name_short']==term] +# print('list_correct', df_temp) +# print(str_canton) +# if str_canton: +# str_correct = check_place(df_temp, str_canton) +# if str_correct in ['not found', 'too many']: +# str_name = add_to_string(str_name, term + ' (CANTONT MISSING)') +# canton_missing = True +# else: +# str_name = add_to_string(str_name, str_temp) +# name_type = 'canton' + str_name = add_to_string(str_name, term + ' (CANTON MISSING)') + name_type = 'canton' + + + temp = '' + if name_type in ['simple', 'double', 'comp']: + temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')] + elif name_type in ['canton']: + temp = tuple(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) +# if canton_missing: +# temp = tuple(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) +# else: +# temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_correct']==str_correct)].iat[0, df_names.columns.get_loc('uniqueIndex')] + + if temp: + if int_uniqueID == 0: + int_uniqueID = temp + else: + int_uniqueID = (int_uniqueID, temp) + + return str_name, str_role, int_uniqueID + + # small function to add term to str_name + def add_to_string(string, term): + if not string: + string = term + else: + string += ' ' + term + return string + + list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente', + 'Berichterstatter', 'Berichterstatterin', 'rapporteur', + 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole', + 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller 
fédéral', + 'Vizepräsident'] + + list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'Gallen', 'StGallen', + 'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter'] + + list_places = get_list_cantons(df_names) + + if bln_print: + print('now is about: ------', term) + # extract list and array of last names + list_all_names = list(df_names['name_short']) + array_all_names = np.array(df_names['name_short']) + + # if term is not easily mistaken as a name (avoid false positives) + if term not in list_notnames: + + # if term is in the list of all names and roles + if term in (list_all_names + list_roles): + # get correct name and uniqueID, or role, for that term + str_name, str_role, int_uniqueID = get_string(term, str_name, str_role, int_uniqueID, str_canton) + + if bln_print: + print('=== correct name', term) + # if term in list of cantons + elif term in list_places: + str_canton = term + # if term is not in list_all_names + else: + # look for similar names based on (normalized) Damerau-Levenshtein distance + # !!! probably need to improve this procedure + # - find better values .... + if bln_print: + print(term) + array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35] + array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized) + if bln_print: + print(array_normalized, array_normalized_values) + array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2] + array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute) + if bln_print: + print(array_absolute, array_absolute_values) + set_intersection = set(array_normalized).intersection(set(array_absolute)) + # check if a similar name was found + term_approx = '' + if len(set_intersection) == 1: + term_approx = list(set_intersection)[0] + if bln_print: + print('we found the name', set_intersection) + elif len(set_intersection) > 1: + # !!! we only look at normalized values + # !!! we don't account for names with same values !!! + array_min = array_normalized[array_normalized_values.argmin()] + term_approx = array_min#[0] + if bln_print: + print('we found several possible names', set_intersection, 'and choose', array_min) + if term_approx: + str_name, str_role, int_uniqueID = get_string(term_approx, str_name, str_role, int_uniqueID, str_canton) + print('*******************', str_name, term_approx) + + + return str_name, str_role, int_uniqueID, find_names + + + +# function to get data frame from lists of names +# input: +# - lists_names: lists of names (simple, double, comp, canton) +# output: +# - df: corresponding dataframe +def get_df_from_lists_names(lists_names): + list_types = ['simple', 'double', 'comp', 'canton'] + df = pd.DataFrame() + for i in range(4): + df_temp = pd.DataFrame(lists_names[i], + columns = ('name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName')) + df_temp['type'] = list_types[i] + df = pd.concat([df, df_temp], ignore_index = True) + return df + + + + +# function to extract votation paragraphs +# !!! maybe we only need a list of votation paragraphs +# !!! 
error prone, possible improvements see notebook extract_discussions +# input: +# - dict_text: dictionary with text of one file +# - list_names: list of votation terms +# - bln_print: whether to print during execution, default False +# output: +# - dict_votations: dictionary with votations +def get_votations(dict_text, list_votationterms, bln_print=True): + count = 0 + dict_votations = {} + for key, text in dict_text.items(): + list_oi = tokenizer.tokenize(text)[:15] +# if len(set(list_oi).intersection(set(list_votationterms))) > 1: + if set(list_oi).intersection(set(list_votationterms)): + count += 1 + dict_votations[key] = text + if bln_print: + print(count, 'MATCH', key, list_oi) + else: + #pass + if bln_print: + print('----- ', list_oi) + + if bln_print: + print(count) + + return dict_votations + + + + +# function to put discussions together +# !!! needs improvement when OCRed xml is corrected (details see notebook) +# input: +# - dict_discussionstarts +# - dict_votations +# output: +# - dict_discussions: dictionary of discussion parts +# key: integer of discussion start +# value: text until next discussion start or votation paragraph +def get_discussions(dict_text, dict_discussionstarts, dict_votations): + + # helper function to add text to discussion dictionary + def add_to_dict(key, i): +# print(key, i) + if key not in dict_discussions: + dict_discussions[key] = dict_discussionstarts[key] + else: + if i in list_text_keys_integers: + actual_i = list(dict_text.keys())[list_text_keys_integers.index(i)] + only_text = dict_discussions[key][1] + dict_text[actual_i] + dict_discussions[key] = (dict_discussions[key][0], only_text) + list_keys.append(i) + + + # list of keys for discussion starts and votation paragraphs + list_discussionstarts = list(dict_discussionstarts.keys()) + list_discussionstarts_integers = [int(tpl[0].split(',')[0]) for tpl in dict_discussionstarts.keys()] + list_votations_strings = list(dict_votations.keys()) + list_votations_integers = [int(tpl.split(',')[0]) for tpl in list_votations_strings] + list_text_keys_integers = [int(tpl.split(',')[0]) for tpl in dict_text.keys()] + + # initialize empty dictionary for discussions and empty list for all added keys + dict_discussions = {} + list_keys = [] + + # if there are no discussion starts, return empty dictionary and list + if not list_discussionstarts: + return dict_discussions, list_keys + + # for every discussion start except last + for idx, key in enumerate(list_discussionstarts_integers[:-1]): + #print(idx, key) + # write discussion start to dictionary + add_to_dict(list_discussionstarts[idx], key) + + # for every textbox until next discussion start + for i in range(key + 1, list_discussionstarts_integers[idx + 1]): + # if it is not a votation paragraph, write it to dictionary, + if i not in list_votations_integers: + add_to_dict(list_discussionstarts[idx], i) + # else, stop execution of for loop + else: + break + + # for last discussion start + last_key = list_discussionstarts_integers[-1] + # write discussion start to dictionary + add_to_dict(list_discussionstarts[-1], last_key) + # for every textbox until the end of the document + for i in range(last_key + 1, max(list_text_keys_integers) + 1): + # if it is not a votation paragraph, write it to dictionary + if i not in list_votations_strings: + add_to_dict(list_discussionstarts[-1], i) + # else, stop execution of for loop + else: + break + + return dict_discussions, list_keys + + +# function to check whether a file containts discussions +# achieved by excluding 
title pages, table of content, etc. +# !!! function works well for 1891 - 1900, not checked after that !!! +def check_if_discussion(path_meta_xml_file, + list_attributes = ['TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR'], + list_nondiscussion = ['inhaltsverzeichnis', 'inhaltsverzeichniss', 'jahresinhalt', 'rednerliste', + 'umschlag', 'sachregister', 'titelblatt', 'numerierung'], + list_nondiscussion2 = ['table', 'matières', 'répertoire', 'procès-verbaux']): + # parse, get root and then part of interest + XML_tree = ET.parse(path_meta_xml_file) + XML_root = XML_tree.getroot() + XML_poi = XML_root[0].find('ADS_TEXTEINHEIT') + + # for each title attribute + for attribute in list_attributes: + # if xml contains this attribute + if attribute in XML_poi.attrib: + # get title and generate set with lower case terms + title = XML_poi.attrib[attribute] + set_title = set([term.lower() for term in title.split()]) + #print(set_title) + # if one of terms is in list_nondiscussion, return False + if set_title.intersection(set(list_nondiscussion)): + #print('NOOO', path_meta_xml_file) + return False + # if two terms are in list_nondiscussion2, also return False + if len(set_title.intersection(set(list_nondiscussion2))) > 1: + #print('NOOO', path_meta_xml_file) + return False + + return True + + + + + +def tokenize_dictionary(dictionary, tokenizer, only_text=False): + dictionary_tokenized = {} + # if there is only text, e.g. when we look at all texts of a document at once (level 2 in flattened dictionary) + if only_text: + for key, text in dictionary.items(): + dictionary_tokenized[key] = tokenizer.tokenize(text) + # if the values are actually tuples (speaker, text), e.g. when a document corresponds to what one person said + else: + for key, text in dictionary.items(): + dictionary_tokenized[key] = (text[0], tokenizer.tokenize(text[1])) + + return dictionary_tokenized + + + +# value of dictionary needs to be tokenized !!!! +def remove_stopwords_from_dictionary(dictionary, list_stopwords, only_text=False): + dict_docs_afterswr = {} + # if there is only text, e.g. when we look at all texts of a document at once (level 2 in flattened dictionary) + if only_text: + for doc, text in dictionary.items(): + list_text = text + list_words_tokenized = [word for word in list_text if word.lower() not in list_stopwords] + dict_docs_afterswr[doc] = ' '.join(list_words_tokenized) + # if the values are actually tuples (speaker, text), e.g. 
+# value of dictionary needs to be tokenized !!!!
+def remove_stopwords_from_dictionary(dictionary, list_stopwords, only_text=False):
+    dict_docs_afterswr = {}
+    # if there is only text, e.g. when we look at all texts of a document at once (level 2 in flattened dictionary)
+    if only_text:
+        for doc, text in dictionary.items():
+            list_text = text
+            list_words_tokenized = [word for word in list_text if word.lower() not in list_stopwords]
+            dict_docs_afterswr[doc] = ' '.join(list_words_tokenized)
+    # if the values are actually tuples (speaker, text), e.g. when a document corresponds to what one person said
+    else:
+        for doc, tupel in dictionary.items():
+            before_colon = tupel[0]
+            list_text = tupel[1]
+            list_words_tokenized = [word for word in list_text if word.lower() not in list_stopwords]
+            dict_docs_afterswr[doc] = (before_colon, ' '.join(list_words_tokenized))
+
+    return dict_docs_afterswr
+
+
+
+def dict_only_text(dictionary):
+    dictionary_only_text = {}
+    for key, tupel in dictionary.items():
+        dictionary_only_text[key] = tupel[1]
+
+    return dictionary_only_text
+
+
+
+
+# two functions for language identification
+# Author: Luis Salamanca
+# small modifications by Lili Gasser
+# Using stopwords
+def identify_lang(dict_text, valid_lang = ('german', 'french', 'italian')):
+
+    language_ratios_textbox = {}
+
+    for i_k in dict_text.keys():
+        tokens = dict_text[i_k][1]
+        test_words = [word.lower() for word in tokens]   # lowercase all tokens
+        test_words_set = set(test_words)
+        language_ratios = {}
+        for language in stopwords.fileids():
+            if language in valid_lang:
+                stopwords_set = set(stopwords.words(language))   # for some languages, e.g. Russian, it would be wise to tokenize the stopwords by punctuation too
+                common_elements = test_words_set.intersection(stopwords_set)
+                language_ratios[language] = len(common_elements)   # language "score"
+        language_ratios_textbox[i_k] = language_ratios
+
+    return language_ratios_textbox
+
+
+# Given the number of occurrences of the stopwords, this assigns a language label
+# to a specific textbox, also considering the possibility of textboxes
+# mixing languages. The threshold ratio_similar handles this case.
+
+def label_language(aux_dict_l):
+    ratio_similar = 0.8
+    if sum(aux_dict_l.values()):
+        # normalize the stopword counts so they sum to 1
+        aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()}
+        lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key]))
+        lang_max = ''
+        count_l = 0
+        for lang in aux_dict_l_norm.keys():
+            if (aux_dict_l_norm[lang] > aux_dict_l_norm[lang_max_aux] * ratio_similar):
+                if count_l > 0:
+                    lang_max += '_'
+                lang_max += lang
+                count_l += 1
+        if count_l > 1:
+            lang_max = 'mixed_' + lang_max
+    else:
+        lang_max = 'NotIdentified'
+    return lang_max
+
+
+# function to get the list of places (canton names, abbreviations, citizenships and first names)
+def get_list_cantons(df_names):
+    df_temp = df_names.loc[df_names['type']=='canton']
+    list_cantonname = list(df_temp['CantonName'])
+    list_cantonabbr = list(df_temp['CantonAbbreviation'])
+    list_citizenship = list(df_temp['Citizenship'])
+    # drop the last five characters of each citizenship entry (presumably a trailing canton code)
+    list_citizenship = [city[:-5] for item in list_citizenship for city in item.split(',')]
+    list_firstname = list(df_temp['FirstName'])
+
+    return list_cantonname + list_cantonabbr + list_citizenship + list_firstname
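To make the language-labelling logic above concrete, here is a small hypothetical walk-through. The counts are invented; in the pipeline they come from identify_lang, which counts stopword hits per textbox.

    # a textbox whose tokens hit 14 German, 12 French and 1 Italian stopword
    label_language({'german': 14, 'french': 12, 'italian': 1})
    # german: 14/27 ~ 0.52 (maximum); french: 12/27 ~ 0.44 > 0.52 * 0.8 ~ 0.41,
    # so both pass the ratio_similar threshold  ->  'mixed_german_french'

    label_language({'german': 0, 'french': 0, 'italian': 0})
    # no stopword hits at all  ->  'NotIdentified'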
diff --git a/src/python/utils_proc.py b/src/python/utils_proc.py
new file mode 100644
index 00000000..5cf163fa
--- /dev/null
+++ b/src/python/utils_proc.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 28 13:44:58 2018
+
+@author: luissalamanca
+"""
+
+# Assorted utility functions
+
+import os
+import numpy as np
+import git
+import subprocess
+import tarfile
+from pdf2image import convert_from_path, convert_from_bytes
+import re
+import xml.etree.ElementTree as ET
+
+#git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
+#git_root = git_repo.git.rev_parse("--show-toplevel")
+
+PATH_PDF2TXT = 'pdf2txt.py'
+#print(PATH_PDF2TXT)
+#PATH_PDF2TXT = "/Users/luissalamanca/anaconda3/envs/py36/bin/pdf2txt.py"
+HEIGHT_CHAR = 12
+WIDTH_CHAR = 6
+
+
+def call_with_out(full_comm):
+    ## run the command in a shell and capture its output ##
+    #print(full_comm)
+    p = subprocess.Popen(full_comm, stdout=subprocess.PIPE, shell=True)
+    (output, err) = p.communicate()
+    p_status = p.wait()   ## wait for the command to terminate
+    #print("Command output : ", output)
+#    print("Command exit status/return code : ", p_status)
+    return output
+
+# This is a bit cumbersome, but wand.image does not import the pdf when run from
+# Spyder, although it works from the terminal. Thus, this function simply calls
+# pdf2image's convert_from_path instead.
+def pdf2png(input_file, res = 300, output_file = '', output_path = ''):
+    if len(output_file) == 0:
+        output_file = os.path.split(os.path.splitext(input_file)[0])[1]
+    if len(output_path) == 0:
+        output_path = os.path.split(os.path.splitext(input_file)[0])[0]
+    convert_from_path(input_file, dpi = res, output_folder = output_path, fmt = 'png')
+
+# page_n is expected to be a numpy array of page numbers
+def pdf2xml(input_file, output_file = '', output_path = '', fmt = 'xml', suffix_str = '_data',
+            page_n = 1, flag_end = 0):
+    # also works for txt or html output
+    if len(output_file) == 0:
+        output_file = os.path.split(os.path.splitext(input_file)[0])[1]
+    if len(output_path) == 0:
+        output_path = os.path.split(os.path.splitext(input_file)[0])[0]
+    ext = fmt
+    if fmt == 'text':
+        ext = 'txt'
+
+    # build the comma-separated list of page numbers
+    string_nums = np.array2string(page_n[0])
+    for p_num in page_n[1:]:
+        string_nums = string_nums + ',' + np.array2string(p_num)
+    name_out = output_path + "/" + output_file + suffix_str + "." + ext
+
+    if flag_end:
+        full_comm = PATH_PDF2TXT + " -o " \
+            + name_out + " -t " + fmt + " -p " \
+            + string_nums + " " + input_file
+    else:
+        full_comm = "python " + PATH_PDF2TXT + " " + input_file + " -o " \
+            + name_out + " -t " + fmt + " -p " \
+            + string_nums
+
+
+#    print(full_comm)
+    call_with_out(full_comm)
+    return name_out
+
+def get_list(year, folder_database, name_file):
+    with tarfile.open(folder_database + '/' + str(year) +'/' + name_file + '.tar.gz') as tar:
+        members = tar.getmembers()
+
+        files_aux = [
+            tarinfo for tarinfo in members
+            if tarinfo.isfile()
+        ]
+
+        files = list()
+        list_ids = list()
+        for f in files_aux:
+            # extract the member name from the TarInfo repr (strip the surrounding quotes)
+            aux_l = str(f).split(' ')[1][1:-1]
+            files.append(aux_l)
+            list_ids.append(aux_l.split('/')[1])
+
+        list_ids = np.unique(np.array(list_ids)).tolist()
+        tar.close()
+    return files, list_ids
+
+def get_handlerfile(input_file, folder_database, name_file):
+    if input_file[:2] != './':
+        year = input_file.split('/')[0]
+        input_file = './' + input_file
+    else:
+        year = input_file.split('/')[1]
+    return tarfile.open(folder_database + '/' + str(year) +'/' + name_file + '.tar.gz').extractfile(input_file)
+
+def tar_extractfile(input_file, folder_database, name_file):
+    if input_file[:2] != './':
+        year = input_file.split('/')[0]
+        input_file = './' + input_file
+    else:
+        year = input_file.split('/')[1]
+    with tarfile.open(folder_database + '/' + str(year) +'/' + name_file + '.tar.gz') as tar:
+        tar.extract(input_file)
+        tar.close()
+    return input_file
+
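A hypothetical usage sketch of the tar helpers above. The archive name '01_rawpdfs' and the paths are invented for illustration; the only assumption taken from the code is the folder_database/<year>/<name_file>.tar.gz layout.

    folder_database = '../data'
    name_file = '01_rawpdfs'   # invented archive name

    # list all members of one yearly archive and the derived document ids
    files, list_ids = get_list(1891, folder_database, name_file)

    # read a single member without unpacking the whole archive
    handler = get_handlerfile(files[0], folder_database, name_file)
    raw_bytes = handler.read()

    # or temporarily extract it into the working directory, preserving its <year>/<file> path
    local_path = tar_extractfile(files[0], folder_database, name_file)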
+def addto_tar(input_file, folder_database, name_file):
+    if input_file[:2] != './':
+        year = input_file.split('/')[0]
+        input_file = './' + input_file
+    else:
+        year = input_file.split('/')[1]
+    name_tar = folder_database + '/' + str(year) +'/' + name_file + '.tar.gz'
+    if os.path.isfile(name_tar):
+        list_files_intar = get_list(year, folder_database, name_file)[0]
+        # If the file already exists inside the tar, we first extract everything
+        # except for it, and then tar everything again
+        if input_file in list_files_intar:
+            for in_f in list_files_intar:
+                if in_f != input_file:
+                    print(in_f, folder_database, name_file)
+                    tar_extractfile(in_f, folder_database, name_file)
+            tf = tarfile.open(name_tar, mode="w")
+            for in_f in list_files_intar:
+                tf.add(in_f)
+
+        else:
+            tf = tarfile.open(name_tar, mode="a")
+            tf.add(input_file)
+    else:
+        tf = tarfile.open(name_tar, mode="w")
+        tf.add(input_file)
+    tf.close()
+    return name_tar
+
+
+# Copies the .db companion file of a document to a pdf, extracts its first page as xml,
+# and mines it for the metadata fields listed in keywords: for every keyword, the value
+# is taken from the textline lying within max_sep units of the keyword's vertical position.
+# The result is appended to the document's metadata xml as a META_FROM_DB element.
+def correct_metadata(year, id_doc, flag_end):
+
+    keywords = ('In','Jahr','Band','Session','Rat','Sitzung','Geschäftsnummer',
+                'Datum','Seite','Ref. No')
+    max_sep = 6 # Just a parameter to capture the textlines from the db file
+
+    full_path = str(year) + '/' + str(id_doc)
+    command = 'cp ' + full_path + '.db ' + full_path + '_db.pdf'
+    call_with_out(command)
+    name_xml = pdf2xml(full_path + '_db.pdf', page_n = np.array([1]), suffix_str = '', flag_end = flag_end)
+
+    XML_tree = ET.parse(name_xml)
+    XML_root = XML_tree.getroot()
+
+    coord_textline = np.array([]).reshape((6,0))
+    all_text_list = list()
+    for ind_el in range(len(XML_root[0])):
+        for ind_line in range(len(XML_root[0][ind_el])):
+            if XML_root[0][ind_el][ind_line].tag == 'textline':
+                coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)
+                coord_textline_aux = np.concatenate((coord_textline_aux, np.array([ind_el,ind_line])))
+                coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((6,1))), axis = 1)
+                text_str = ''
+                aux_str = text_str.join([XML_root[0][ind_el][ind_line][ind_t].text for ind_t in range(len(XML_root[0][ind_el][ind_line]))])
+                all_text_list.append(aux_str)
+
+    xml_extrameta = ET.Element('META_FROM_DB')
+    for keyw in keywords:
+        #print(keyw)
+        ind_textl = np.min(np.argwhere(np.array([all_text_list]) == keyw + '\n')[:,1])
+        coord_r = coord_textline[1,ind_textl]
+        distance = abs(coord_textline[1,:] - coord_r)
+        ind_valtextl = np.setdiff1d(np.argwhere(distance < max_sep),ind_textl)[0]
+        if ind_valtextl.size:
+            xml_extrameta.attrib[keyw.replace('. ','').upper()] = all_text_list[ind_valtextl][:-1]
+        else:
+            xml_extrameta.attrib[keyw.replace('. ','').upper()] = ''
+
+    path_xml_meta = full_path + '.xml'
+    XML_tree_meta = ET.parse(path_xml_meta)
+    XML_root_meta = XML_tree_meta.getroot()
+    XML_root_meta[0].append(xml_extrameta)
+
+    tree = ET.ElementTree(XML_root_meta)
+    tree.write(full_path + '_metacorr.xml', encoding = 'utf-8')
+    return full_path + '_metacorr.xml'
\ No newline at end of file
diff --git a/src/sh/execute_per_year.sh b/src/sh/execute_per_year.sh
new file mode 100755
index 00000000..c65b67c2
--- /dev/null
+++ b/src/sh/execute_per_year.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# The input variables are:
+# 1 - name of the python script to run
+# 2 - folder of the database
+# 3 - name of the input file
+# 4 - name of the output file
+
+year_start=1891
+year_end=1894
+
+for year in $(seq $year_start $year_end)
+do
+	echo $year
+	renku run python $1 ${2}/$year/${3}.tar.gz ${2}/$year/${4}.tar.gz
+done
+
--
GitLab
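As a final, purely illustrative sketch: the per-year shell loop above drives the run_* scripts on whole archives, and those scripts build on the helpers in utils_proc.py. Called directly, the metadata-correction step looks roughly like this (the working directory and the document id are invented, and the <year>/<id>.db and <year>/<id>.xml files are assumed to be already unpacked there).

    import os
    import utils_proc

    # assumed: the current directory contains the unpacked <year>/<id>.db and <year>/<id>.xml files
    os.chdir('../data')
    path_corrected = utils_proc.correct_metadata(1891, 20030001, flag_end=0)
    # -> '1891/20030001_metacorr.xml', with the META_FROM_DB attributes appended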