Skip to content
Snippets Groups Projects
Commit 91bcf101 authored by Lili Gasser's avatar Lili Gasser
Browse files

Merge branch 'master' into 'extract-speakers'

# Conflicts:
#   .gitattributes
parents 3aeb9135 a7176003
No related branches found
No related tags found
No related merge requests found
......@@ -319,3 +319,6 @@ data/AB/1947/05_annotatedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
data/AB/1948/05_annotatedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
data/AB/1949/05_annotatedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
data/AB/1950/05_annotatedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
data/AB/1976/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
data/AB/1977/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
data/AB/1975/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
......@@ -24,8 +24,9 @@ image_build:
dot:
stage: build
image: renku/singleuser:latest
image: renku/renku-python:latest
script:
- renku --version
- renku log --format dot $(git ls-files --no-empty-directory --recurse-submodules) > graph.dot
artifacts:
paths:
......
# CWL CommandLineTool: invokes `python` with three positional arguments --
# the run_correctxml.py script, the extracted-XML archive for 1976, and the
# path of the corrected-XML archive to produce.
arguments: []
baseCommand:
- python
class: CommandLineTool
cwlVersion: v1.0
hints: []
inputs:
  # Position 1: the script handed to the python interpreter.
  input_1:
    default:
      class: File
      path: ../../src/python/run_correctxml.py
    inputBinding:
      position: 1
      separate: true
      shellQuote: true
    streamable: false
    type: File
  # Position 2: the input archive.
  input_2:
    default:
      class: File
      path: ../../data/AB/1976/02_extractedxml.tar.gz
    inputBinding:
      position: 2
      separate: true
      shellQuote: true
    streamable: false
    type: File
  # Position 3: output path, passed as a plain string because the file does
  # not exist before the tool runs.
  input_3:
    default: data/AB/1976/04_correctedxml.tar.gz
    inputBinding:
      position: 3
      separate: true
      shellQuote: true
    streamable: false
    type: string
outputs:
  # The produced file is located by globbing for the path given as input_3.
  output_0:
    outputBinding:
      glob: $(inputs.input_3)
    streamable: false
    type: File
permanentFailCodes: []
requirements:
- class: InlineJavascriptRequirement
- class: InitialWorkDirRequirement
  listing:
  # Stage an empty, writable data/AB/1976 directory in the working directory
  # so the output path from input_3 has a parent to be written into.
  - entry: '$({"listing": [], "class": "Directory"})'
    entryname: data/AB/1976
    writable: true
successCodes: []
temporaryFailCodes: []
# CWL CommandLineTool: invokes `python` with three positional arguments --
# the run_correctxml.py script, the extracted-XML archive for 1977, and the
# path of the corrected-XML archive to produce.
arguments: []
baseCommand:
- python
class: CommandLineTool
cwlVersion: v1.0
hints: []
inputs:
  # Position 1: the script handed to the python interpreter.
  input_1:
    default:
      class: File
      path: ../../src/python/run_correctxml.py
    inputBinding:
      position: 1
      separate: true
      shellQuote: true
    streamable: false
    type: File
  # Position 2: the input archive.
  input_2:
    default:
      class: File
      path: ../../data/AB/1977/02_extractedxml.tar.gz
    inputBinding:
      position: 2
      separate: true
      shellQuote: true
    streamable: false
    type: File
  # Position 3: output path, passed as a plain string because the file does
  # not exist before the tool runs.
  input_3:
    default: data/AB/1977/04_correctedxml.tar.gz
    inputBinding:
      position: 3
      separate: true
      shellQuote: true
    streamable: false
    type: string
outputs:
  # The produced file is located by globbing for the path given as input_3.
  output_0:
    outputBinding:
      glob: $(inputs.input_3)
    streamable: false
    type: File
permanentFailCodes: []
requirements:
- class: InlineJavascriptRequirement
- class: InitialWorkDirRequirement
  listing:
  # Stage an empty, writable data/AB/1977 directory in the working directory
  # so the output path from input_3 has a parent to be written into.
  - entry: '$({"listing": [], "class": "Directory"})'
    entryname: data/AB/1977
    writable: true
successCodes: []
temporaryFailCodes: []
......@@ -16,4 +16,7 @@ RUN sudo apt-get install -y vim
# install spacy models
RUN python -m spacy download de_core_news_sm
RUN python -m spacy download fr_core_news_sm
RUN python -m spacy download xx_ent_wiki_sm
\ No newline at end of file
RUN python -m spacy download xx_ent_wiki_sm
# install nltk stopwords and punkt
RUN python -m nltk.downloader punkt && python -m nltk.downloader stopwords
\ No newline at end of file
File added
......@@ -34,6 +34,13 @@ folder_database = input_file.split(year_tocomp)[0]
t1 = time.time()
name_tar_file = input_file.split('/')[-1].split('.tar.gz')[0]
print(input_file)
comm = 'git lfs pull -I ' + input_file
utils_proc.call_with_out(comm)
input_file_pdf = '/'.join(input_file.split('/')[:-1]) + '/00_rawpdfs.tar.gz'
comm = 'git lfs pull -I ' + input_file_pdf
utils_proc.call_with_out(comm)
files_proc, _ = utils_proc.get_list(year_tocomp, folder_database, name_tar_file)
list_proc = list()
......@@ -44,12 +51,12 @@ for infile in files_proc:
if infile_aux not in list_proc:
list_proc.append(infile_aux)
d1 = defc.Document(infile_aux, folder_database)
try :
d1.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, name_outxml = name_tar_file,
name_outcorrxml = name_tar_out)
#try :
d1.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, name_outxml = name_tar_file,
name_outcorrxml = name_tar_out)
#print('Corrected %s' % infile)
except:
print("File to correct %s prompted an error" % infile)
#except:
# print("File to correct %s prompted an error" % infile)
# Commands to get the compressed version of the file
#data/AB/${year}/02_extractedxml.tar.gz
......
......@@ -12,5 +12,6 @@ for year in $(seq $year_start $year_end)
do
echo $year
$CONDA_DIR/envs/renku/bin/renku run --isolation python $1 ${2}/$year/${3}.tar.gz ${2}/$year/${4}.tar.gz
git push
done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment