diff --git a/SB05_007_lassner_et_al/ocr_2021_001.png b/SB05_007_lassner_et_al/ocr_2021_001.png new file mode 100644 index 0000000000000000000000000000000000000000..fc0bafcdd33f6687d678c038ecd03a89bd6b3c10 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_001.png differ diff --git a/SB05_007_lassner_et_al/ocr_2021_002.png b/SB05_007_lassner_et_al/ocr_2021_002.png new file mode 100644 index 0000000000000000000000000000000000000000..e575dce9d74dbb5f78fd0a498cb49bf44e292e47 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_002.png differ diff --git a/SB05_007_lassner_et_al/ocr_2021_003.png b/SB05_007_lassner_et_al/ocr_2021_003.png new file mode 100644 index 0000000000000000000000000000000000000000..70840efbc75433e56e0c6ab7a8ce317ee4a98fb2 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_003.png differ diff --git a/SB05_007_lassner_et_al/ocr_2021_004.png b/SB05_007_lassner_et_al/ocr_2021_004.png new file mode 100644 index 0000000000000000000000000000000000000000..71b4e4c36ca65baa6e942edf24669b816e4f7e98 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_004.png differ diff --git a/SB05_007_lassner_et_al/ocr_2021_v1_0.pdf b/SB05_007_lassner_et_al/ocr_2021_v1_0.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ffe586ddf569f34cfa052b75f18bec5d86488646 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_v1_0.pdf differ diff --git a/SB05_007_lassner_et_al/ocr_2021_v1_0.xml b/SB05_007_lassner_et_al/ocr_2021_v1_0.xml new file mode 100644 index 0000000000000000000000000000000000000000..5f724f478169cb1065081a46058072952942f61e --- /dev/null +++ b/SB05_007_lassner_et_al/ocr_2021_v1_0.xml @@ -0,0 +1,2214 @@ +<?xml version="1.0" encoding="utf-8"?> +<?xml-model href="https://www.zfdg.de/sites/default/files/schema/tei_zfdg.rnc" type="application/relax-ng-compact-syntax" + ?> +<TEI xmlns="http://www.tei-c.org/ns/1.0" + xmlns:html="http://www.w3.org/1999/html" + xmlns:tei="http://www.tei-c.org/ns/1.0" + xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns:xhtml="http://www.w3.org/1999/xhtml"> + <teiHeader> + <fileDesc> + <titleStmt> + <title> + <biblStruct> + <analytic> + <title level="a">Publishing an OCR ground truth data set for reuse in an unclear + copyright setting. 
Two case studies with legal and + technical solutions to enable a collective OCR ground truth data set effort</title> + + <respStmt> + <resp> + <persName> + <name role="marc_aut"> + <forename>David</forename> + <surname>Lassner</surname> + </name> + <email>lassner@tu-berlin.de</email> + <idno type="gnd">1246941414</idno> + <idno type="orcid">0000-0001-9013-0834</idno> + </persName> + </resp> + <orgName>Technische Universität Berlin, Machine Learning Group | The Berlin Institute for the Foundations of Learning and Data (BIFOLD)</orgName> + </respStmt> + <respStmt> + <resp> + <persName> + <name role="marc_aut"> + <forename>Julius</forename> + <surname>Coburger</surname> + </name> + <email>julius.coburger@gmx.de</email> + <idno type="gnd">124694197X</idno> + <idno type="orcid">0000-0003-4502-7955</idno> + </persName> + </resp> + <orgName>Technische Universität Berlin, Machine Learning Group</orgName> + </respStmt> + <respStmt> + <resp> + <persName> + <name role="marc_aut"> + <forename>Clemens</forename> + <surname>Neudecker</surname> + </name> + <email>clemens.neudecker@sbb.spk-berlin.de</email> + <idno type="gnd">1246943069</idno> + <idno type="orcid">0000-0001-5293-8322</idno> + </persName> + </resp> + <orgName>Staatsbibliothek zu Berlin – Preußischer Kulturbesitz</orgName> + </respStmt> + <respStmt> + <resp> + <persName> + <name role="marc_aut"> + <forename>Anne</forename> + <surname>Baillot</surname> + </name> + <email>anne.baillot@univ-lemans.fr</email> + <idno type="gnd">1065904681</idno> + <idno type="orcid">0000-0002-4593-059X</idno> + </persName> + </resp> + <orgName>Le Mans Université | École normale supérieure de Lyon, Interactions, Corpus, Apprentissages, Représentations - ICAR</orgName> + </respStmt> + <idno type="doi">10.17175/sb005_006</idno> + <idno type="ppn">1780168195</idno> + <idno type="zfdg">2021.006</idno> + <idno type="url">https://www.zfdg.de/node/340</idno> + <date when="2021-09-08">10.12.2021</date> + </analytic> + <monogr> + <title level="j">Zeitschrift für digitale Geisteswissenschaften</title> + <title level="m">Sonderband: Fabrikation von Erkenntnis – Experimente in den Digital Humanities.</title> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Manuel</forename> + <surname>Burghardt</surname> + </name> + <email>burghardt@informatik.uni-leipzig.de</email> + <idno type="gnd">1237665523</idno> + <idno type="orcid">0000-0003-1354-9089</idno> + </resp> + <orgName>Universität Leipzig</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Lisa</forename> + <surname>Dieckmann</surname> + </name> + <email>lisa.dieckmann@uni-koeln.de</email> + <idno type="gnd">1077268289</idno> + <idno type="orcid">0000-0002-1708-7371</idno> + </resp> + <orgName>Universität zu Köln</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <surname>Timo</surname> + <forename>Steyer</forename> + </name> + <email>t.steyer@tu-braunschweig.de</email> + <idno type="gnd">1053806175</idno> + <idno type="orcid">0000-0003-0218-2269</idno> + </resp> + <orgName>Technische Universität Braunschweig</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Peer</forename> + <surname>Trilcke</surname> + </name> + <email>trilcke@uni-potsdam.de</email> + <idno type="gnd">139145117</idno> + <idno type="orcid">0000-0002-1421-4320</idno> + </resp> + <orgName>Universität Potsdam</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Niels-Oliver</forename> + <surname>Walkowski</surname> + </name> + 
<email>niels-oliver.walkowski@uni.lu</email> + <idno type="gnd">1023378671</idno> + <idno type="orcid">0000-0003-3043-3010</idno> + </resp> + <orgName>Universität Luxemburg</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Joëlle</forename> + <surname>Weis</surname> + </name> + <email>weis@hab.de</email> + <idno type="gnd">1233399721</idno> + <idno type="orcid">0000-0002-0080-4362</idno> + </resp> + <orgName>Forschungsverbund Marbach Weimar Wolfenbüttel</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Ulrike</forename> + <surname>Wuttke</surname> + </name> + <email>wuttke@fhpotsdam.de</email> + <idno type="gnd">1107808405</idno> + <idno type="orcid">0000-0002-8217-4025</idno> + </resp> + <orgName>Fachhochschule Potsdam</orgName> + </respStmt> + <respStmt> + <resp>Publiziert von</resp> + <orgName role="marc_pbl">Herzog August Bibliothek</orgName> + </respStmt> + <respStmt> + <resp>Transformation der Word Vorlage nach TEI</resp> + <name role="marc_trc"> + <surname>Baumgarten</surname> + <forename>Marcus</forename> + <idno type="gnd">1192832655</idno> + </name> + </respStmt> + <availability status="free"> + <p>Available at <ref target="http://www.zfdg.de">https://www.zfdg.de</ref> + </p> + </availability> + <biblScope unit="sonderband">5</biblScope> + <biblScope unit="artikel">6</biblScope> + </monogr> + </biblStruct> + </title> + </titleStmt> + <editionStmt> + <edition>Elektronische Ausgabe nach TEI P5</edition> + </editionStmt> + <publicationStmt> + <distributor> + <name> + <orgName>Herzog August Bibliothek Wolfenbüttel</orgName> + </name> + </distributor> + <idno type="doi">10.17175/sb005</idno> + <idno type="ppn">1764792149</idno> + <idno type="url">https://www.zfdg.de/sonderband/5</idno> + <date when="2021-09-19">2021</date> + <authority> + <name>Herzog August Bibliothek</name> + <address> + <addrLine/> + </address> + </authority> + <authority> + <name>Forschungsverbund MWW</name> + <address> + <addrLine/> + </address> + </authority> + <availability status="free"> + <p> Sofern nicht anders angegeben </p> + <licence target="http://creativecommons.org/licenses/by/4.0/">CC BY SA + 4.0</licence> + </availability> + <availability status="free"> + <p> Available at <ref target="workID">https://www.zfdg.de"> (c) + Forschungsverbund MWW</ref> + </p> + </availability> + </publicationStmt> + <sourceDesc> + <p>Einreichung zum Call for Publications im Rahmen der vDHd21.</p> + </sourceDesc> + </fileDesc> + <encodingDesc> + <editorialDecl> + <p>Transformation der WORD-Vorlage nach XML/TEI-P5 durch die Oxgarage und eigenen + XSLT; Lektorat des Textes durch die Herausgeber*innen und die Redaktion der ZfdG.</p> + <p>Medienrechte liegen bei den Autor*innen.</p> + <p>All links checked<date when="2021">02.12.2021</date> + </p> + </editorialDecl> + </encodingDesc> + <profileDesc> + <creation>Einreichung für den Sonderband 5 der Zeitschrift für digitale + Geisteswissenschaften.</creation> + <langUsage> + <language ident="de">Text in Deutsch</language> + <language ident="de">Abstract in Deutsch</language> + <language ident="en">Abstract in Englisch</language> + </langUsage> + <textClass> + <keywords scheme="gnd"> + <term>Informatik<ref target="4026894-9"/> + </term> + <term>Maschinelles Lernen<ref target="4193754-5"/> + </term> + <term>Optische Zeichenerkennung<ref target="4310936-6"/> + </term> + <term>Urheberrecht<ref target="4062127-3"/> + </term> + </keywords> + </textClass> + </profileDesc> + <revisionDesc> + <change/> + </revisionDesc> + 
</teiHeader> + <text> + <body> + <div> + <div type="abstract"> + <argument xml:lang="de"> + <p>In dieser Arbeit stellen wir einen OCR-Trainingsdatensatz für + historische Drucke vor und zeigen, wie sich im Vergleich zu unspezifischen Modellen + die Erkennungsgenauigkeit verbessert, wenn sie mithilfe dieser Daten weitertrainiert + werden. Wir erörtern die Nachnutzbarkeit dieses Datensatzes anhand von zwei + Experimenten, die die rechtliche Grundlage zur Veröffentlichung digitalisierter + Bilddateien am Beispiel von deutschen und englischen Büchern des 19. Jahrhunderts + betrachten. Wir präsentieren ein Framework, mit dem OCR-Trainingsdatensätze + veröffentlicht werden können, auch wenn die Bilddateien nicht zur + Wiederveröffentlichung freigegeben sind.</p> + </argument> + </div> + <div type="abstract"> + <argument xml:lang="en"> + <p>We present an OCR ground truth data set for historical prints + and show improvement of recognition results over baselines with training on this + data. We reflect on reusability of the ground truth data set based on two + experiments that look into the legal basis for reuse of digitized document images in + the case of 19th century English and German books. We propose a framework for + publishing ground truth data even when digitized document images cannot be easily + redistributed. </p> + </argument> + </div> + <div type="chapter"> + <head>1. Introduction</head> + <p>Digital access to Cultural Heritage is + a key challenge for today’s society. It has been improved by <term type="dh">Optical Character Recognition</term> (OCR), which is the + task by which a computer program extracts text from a digital image in order to draw + the text from that image and present it in a machine-readable form. For historical + prints, off-the-shelf OCR solutions often result in inaccurate readings. Another + impediment to accessing digitized cultural heritage data consists in the fact that + cultural heritage institutions provide online access to massive amounts of digitized + images of historical prints that have not been (or have been poorly) OCRed. + Solutions to improve this situation would benefit a wide range of actors, be they + scholars or a general audience. Many actors would indeed profit greatly from methods + conceived to extract high quality machine-readable text from images.</p> + <p>The results of an OCR method can be + improved significantly by using a pre-trained model and fine-tuning it on only a few + samples that display similar characteristics.<note type="footnote"> See + <ref type="bibliography" target="#liebl_newspapers_2020">Liebl + / Burghardt 2020</ref>; <ref type="bibliography" target="#reul_learning_2017">Reul et al. 2017</ref>; + <ref type="bibliography" target="#springmann_truth_2018">Springmann et al. 2018</ref>.</note> To + that end, there has been a growing effort from the Digital Humanities community to + create and publish data sets for specific historical periods, languages and + typefaces aiming at enabling scholars to fine-tune OCR models for their collection + of historical documents.<note type="footnote"> See <ref type="bibliography" target="#padilla_report_2019">Padilla et al. 2019</ref>. For manuscripts, just recently the Transcriptiones platform launched, see + <ref target="https://www.librarylab.ethz.ch/project/transcriptiones/">transcriptiones</ref>, <ref type="bibliography" target="#eth_transcriptiones_2020">ETH-Library + 2020</ref>. 
For French texts from the 18th to the 21st century there exists HTR-United, see
+ <ref target="https://htr-united.github.io/">htr-united</ref>, <ref type="bibliography" target="#htr_united_2021">Chagué
+ / Clérice 2021</ref>. The slightly different approach
+ of just publishing fine-tuned models for different settings is proposed by
+ Transkribus, see <ref target="http://transkribus.eu/wiki/images/d/d6/Public_Models_in_Transkribus.pdf">Transkribus</ref>,
+ <ref type="bibliography" target="#readcoop_models_2021">READ-COOP
+ 2021</ref>, or <ref type="bibliography" target="#kraken_git_2021">Kraken 2021</ref>
+ <ref target="https://zenodo.org/communities/ocr_models/">ocr_models</ref>, <ref type="bibliography" target="#ocr_model_2021">OCR/HTR model
+ repository 2021</ref>.</note> In Germany, the DFG-funded OCR-D initiative
+ brings together major research libraries with the goal of creating an open source
+ framework for the OCR of historical printed documents, including specifications and
+ guidelines for OCR ground truths.<note type="footnote"> See
+ <ref type="bibliography" target="#engl_volltexte_2020">Engl
+ 2020</ref>.</note>
+ </p>
+ <p>In order to improve OCR results,
+ images and the corresponding transcriptions are collected in such a way that each
+ pair (image and text) only represents one line of text from the original page. This
+ is called a ground truth data set and is precisely what we will focus on in the
+ following.</p>
+ <p>Besides the fact that manually transcribing
+ images is tedious work, another major issue arises in
+ this type of collective effort: the institutions that produce the scans often
+ claim some form of copyright to them. For example, on the first page of any of their
+ PDFs, Google Books <quote>[…] request[s] that you use these files for
+ personal, non-commercial purposes</quote><note type="footnote"> <ref type="bibliography" target="#google_information_2021">Google Inc. 2021</ref>, cited after <ref type="bibliography" target="#ruiz_agreement_2011">Ruiz 2011</ref>.</note>. As a consequence, a scholar aiming to create an OCR
+ ground truth data set would not know with certainty whether the rights to
+ redistribute the textline images derived from the PDF can be considered
+ granted.</p>
+ <p>In this paper, we present an OCR
+ ground truth data set with an unclear copyright setting for the image data. We
+ discuss the legal background, show the relevance of the data set and provide an
+ in-depth analysis of its constitution and reuse by investigating two different
+ approaches to overcome the copyright issues.</p>
+ <p>In order to address these issues, we
+ compare the following two ways of publishing the OCR ground truth data set with
+ image data. </p>
+ <list type="unordered">
+ <item>As Google Books works with cultural heritage institutions (CHIs) to digitize
+ books, we asked permission from the CHIs to redistribute the image data. </item>
+ <item>We published a data set formula, which consists of the transcriptions, links
+ to the image sources, and a description of how to build the data set. For this
+ process, we provide a fast, highly automated framework that enables others to
+ reproduce the data set. </item>
+ </list>
+ </div>
+ <div type="chapter">
+ <head>2. 
Legal background + and its interpretation at CHIs</head> + <p>Clarifying the copyright situation for + the scans of a book collection requires to take into account, for each book, the + cultural heritage institution owning the book (usually a library), and, in the case + of private-public partnerships, also the scanning institution (e. g. Google Books) + involved in its digitization. For Google Books, there exist different contracts + between CHIs and Google, and not all of them are open to public inspection. However, + based on comparing the ones that are available, we assume that other contracts are + to some extent similar (see <ref type="intern" target="#hd16">List of Contracts</ref>). The + contracts contain information on the ›Library Digital Copy‹ for which non-profit + uses are defined under Section 4.8 (cf. British Library Google Contract), which + states that a </p> + <p> + <quote type="grosszitat">Library may provide all or any + portion of the Library Digital Copy, that is [...] a Digital Copy of a Public + Domain work to (a) academic institutions or research libraries, or (b) when + requested by Library and agreed upon in writing by Google, other not-for-profit + or government entities that are not providing search or hosting services + substantially similar to those provided by Google.</quote><note type="footnote"> British Library Google Books Agreement in <ref type="bibliography" target="#ruiz_agreement_2011">Ruiz + 2011.</ref></note> + </p> + <p>When trying to unpack this legal + information against the use case presented here, multiple questions arise. What are + the legal possibilities for individual scholars regarding the use of the Library + Digital Copy of a Public Domain work? How can there be limitations in the use of a + Public Domain work? Is the use case of OCR model training substantially similar to + any search or hosting services provided by Google? Would and can libraries act as + brokers in negotiating written agreements about not-for-profit use with Google?</p> + <p>In the continuation of Section 4.8, + additional details are specified with regard to data redistribution by ›Additional + institutions‹ where </p> + <p> + <quote type="grosszitat">[a written agreement with + Google] will prohibit such Additional institution from redistributing [...] + portions of the Library Digital Copy to other entities (beyond providing or + making content available to scholars and other users for educational or research + purposes.</quote><note type="footnote"> + British Library Google Books Agreement in <ref type="bibliography" target="#ruiz_agreement_2011">Ruiz 2011</ref>.</note> + </p> + <p>This brings up further questions but + also opens the perspective a bit, since there appear to be exceptions for <quote>scholars and other users for educational or research + purposes</quote><note type="footnote"> + British Library Google Books Agreement in <ref type="bibliography" target="#ruiz_agreement_2011">Ruiz 2011</ref>.</note>, which is a precise fit of the use case we + present here. Now what does this mean in practice? Digital Humanities scholars are + not necessarily legal experts, so how do libraries that have entered + public-private-partnerships with Google for digitization of Public Domain works + implement these constraints? Schöch et al. 
discuss a wide range of use cases in the + area of text and data mining with copyright protected digitized documents, but they + do not cover the creation and distribution of ground truth.<note type="footnote"> See + <ref type="bibliography" target="#schoech_textformate_2020">Schöch + et al. 2020</ref>.</note> In other scenarios that involve copyrighted texts + published in derived formats, one question typically preventing redistribution is + whether it is possible to re-create the (copyright-protected) work from the derived + parts. In the case of textline ground truth, it is however likely that this would + constitute a violation of such a principle. In this unclear setting, scholars are in + need of support and guidance by CHIs.</p> + <table> + <row> + <cell>Institution</cell> + <cell>Total # books</cell> + <cell>Total # pages</cell> + <cell>Response time (# working days)</cell> + <cell>Allowed to publish as part of the + paper</cell> + <cell>Allowed to license</cell> + <cell>Alternative source</cell> + <cell>Responsible</cell> + <cell>Citation needed</cell> + </row> + + + <row> + <cell>Bayerische Staatsbibliothek</cell> + <cell>4</cell> + <cell>12</cell> + <cell>3</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>Biblioteca Statale Isontina Gorizia</cell> + <cell>1</cell> + <cell>3</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + </row> + + + <row> + <cell>Bodleian Library</cell> + <cell>11</cell> + <cell>20</cell> + <cell>2</cell> + <cell>yes, + alternative</cell> + <cell>already CC-BY-NC</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>British Library</cell> + <cell>1</cell> + <cell>35</cell> + <cell>4</cell> + <cell>no</cell> + <cell>no</cell> + <cell>no</cell> + <cell>yes</cell> + <cell>–</cell> + </row> + + + <row> + <cell>Harvard University, Harvard College Library</cell> + <cell>1</cell> + <cell>3</cell> + <cell>0</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>no</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>New + York Public Library</cell> + <cell>5</cell> + <cell>29</cell> + <cell>3</cell> + <cell>–</cell> + <cell>–</cell> + <cell>no</cell> + <cell>no</cell> + <cell>no</cell> + </row> + + + <row> + <cell>Austrian National Library</cell> + <cell>2</cell> + <cell>6</cell> + <cell>10</cell> + <cell>yes, + alternative</cell> + <cell>no</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>Robarts – University of Toronto</cell> + <cell>2</cell> + <cell>3</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + </row> + + + <row> + <cell>University of Illinois Urbana-Champaign</cell> + <cell>6</cell> + <cell>4</cell> + <cell>0</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>no</cell> + <cell>yes</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>University of Wisconsin – Madison</cell> + <cell>8</cell> + <cell>24</cell> + <cell>2</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>no</cell> + <cell>no</cell> + <cell>no</cell> + </row> + <trailer xml:id="tab01"> + <ref type="intern" target="#tab1">Tab. 1</ref>: Responses of library institutions to our request to + grant permission to publish excerpts of the scans for which they were contractors of + the digitization. 
Most institutions responded within a few working days and except + for the fact that most acknowledged the public domain of the items, the responses + were very diverse. Many answered that they are either not responsible or only + responsible for their Library Copy of the PDF. [Lassner et al. 2021] + <ref type="graphic" target="#ocr_2021_t1"/> + </trailer> + </table> + + <p>We have asked ten CHIs for permission + to publish image data that was digitized based on their collection in order to + publish them as part of an OCR ground truth data set under a CC-BY license. As shown + in <ref type="graphic" target="#tab01">Table 1</ref>, the institutions gave a wide + variety of responses. Many institutions acknowledged that the requested books are in + the public domain because they were published before the year 1880. However, there + is no general consensus on whether the CHIs are actually responsible for granting + these rights, especially if one wants to use the copy from the Google Books or + Internet Archive servers. Some institutions stated that they are only responsible + for their Library Copy of the scan and granted permission to publish only from that + source. Only two institutions, the Bayerische Staatsbibliothek and University of + Illinois Urbana-Champaign stated that they are responsible and that we are allowed + to also use the material that can be found on the Google Books or Internet Archive + servers. </p> + <p>This case study underlines the lack of + a clear and simple framework of reference that would be recognized and applied, and + would reflect on good practices in the relationships between CHIs and digital + scholarship. The lack of such a framework is addressed among others by the DARIAH + initiative of the Heritage Data Reuse Charter<note type="footnote"> See + <ref type="bibliography" target="#baillot_data_2016">Baillot + et al. 2016</ref>. For additional information on the DARIAH Heritage Data Reuse + Charter, see <ref target="https://www.dariah.eu/activities/open-science/data-re-use/">data-re-use</ref>, + <ref type="bibliography" target="#heritage_data_2021">DARIAH 2021</ref>.</note> that was + launched in 2017. Another approach towards such a framework is that of the ›digital + data librarian‹.<note type="footnote"> + See + <ref type="bibliography" target="#eclevia_data_2019">Eclevia + et al. 2019</ref>.</note> + </p> + </div> + <div type="chapter"> + <head>3. Description of the + data set </head> + <p>In the data set that we want to + publish in the context of our OCR ground truth, we do not own the copyright for the + image data.<note type="footnote"> The current version of the data set can be found at + <ref target="https://github.com/millawell/ocr-data/tree/master/data">ocr-data/data</ref>, + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> We therefore + distinguish between the data set formula and the built data set. We publish the data + set formula which contains the transcriptions, the links to the images and a recipe + on how to build the data set.</p> + <p>The data set formula and source code + are published on Github<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/">ocr-data</ref>, + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> and the version + 1.1 we are referring to in this paper is mirrored on the open access repository + Zenodo.<note type="footnote"> See + <ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 
2021</ref>.</note> The data set is published under a CC-BY 4.0 license + and the source code is published under an Apache license.</p> + <div type="subchapter"> + <head>3.1 Origin</head> + <p>The built data set contains images + from editions of books by Walter Scott and William Shakespeare in the original + English and in translations into German that were published around 1830. </p> + <p>The data set was created as part of a + research project that investigates how to implement stylometric methods that are + commonly used to analyze the style of authors with the goal of analyzing that of + translators. The data set was organized in such a way that other variables like + authors of the documents or publication date can be ruled out as a confounder of the + translator style. </p> + <p>We found that 1830 Germany was + especially suitable for the research setting we had in mind. Due to an increased + readership in Germany around 1830, there was a growing demand in books. Translating + foreign publications into German turned out to be particularly profitable because, + at that time, there was no copyright regulation that would apply equally across + German-speaking states. There was no general legal constraint to regulate payments + to the original authors of books or as to who was allowed to publish a German + translation of a book. Therefore, publishers were competing in translating most + recent foreign works into German, which resulted in multiple German translations by + different translators of the same book at the same time. To be the first one to + publish a translation into German, publishers resorted to what was later called + translation factories, optimized for translation speed.<note type="footnote"> See + <ref type="bibliography" target="#bachleiter_uebersetzungsfabriken_1989">Bachleitner + 1989</ref>.</note> The translators working in such ›translation factories‹ + were not specialized in the translation of one specific author. It is in fact not + rare to find books from different authors translated by the same translator.</p> + </div> + <div type="subchapter"> + <head>3.2 Method</head> + <p>We identified three translators who + all translated books from both Shakespeare and Scott, sometimes even the same books. + We also identified the English editions that were most likely to have been used by + the translators. This enabled us to set up a book-level parallel English-German + corpus allowing us to, again, rule out the confounding author signal.</p> + <p>As the constructed data set is only + available in the form of PDFs from Google Books and the Internet Archive or the + respective partner institutions, OCR was a necessary step for applying stylometric + tools on the text corpus. To assess the quality of off-the-shelf OCR methods and to + improve the OCR quality, for each book, a random set of pages was chosen for manual + transcription. </p> + <div type="subchapter"> + <head>3.2.1 Preparation</head> + <p>Following the OCR-D initiative’s + specifications and best practices,<note type="footnote"> See <ref target="https://ocr-d.de/en/spec/">ocr-d + spec</ref>, <ref type="bibliography" target="#ocrd_ocrd_2021">OCR-D + 2021</ref>.</note> for each book, we created a METS<note type="footnote"> See <ref target="http://www.loc.gov/standards/mets/">METS</ref>, + <ref type="bibliography" target="#mets_loc_2021">The Library of Congress + 2021</ref>.</note> file that contains the link to the source PDF as + well as the chosen pages. 
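+ Such a METS skeleton can be assembled with a few lines of Python; the sketch below is a
+ deliberately simplified illustration, in which the element structure, attribute set and the
+ URL pattern combining identifier and page number are assumptions made for the example rather
+ than the OCR-D-conformant structure of the published files (cf. the mets_page_template.xml
+ template in the repository): <code>
+ import xml.etree.ElementTree as ET
+
+ METS = "http://www.loc.gov/METS/"
+ XLINK = "http://www.w3.org/1999/xlink"
+ ET.register_namespace("mets", METS)
+ ET.register_namespace("xlink", XLINK)
+
+ def build_mets(source_url, pages):
+     # one file entry per selected page, pointing back to the source PDF
+     mets = ET.Element(ET.QName(METS, "mets"))
+     file_sec = ET.SubElement(mets, ET.QName(METS, "fileSec"))
+     file_grp = ET.SubElement(file_sec, ET.QName(METS, "fileGrp"), USE="IMAGE")
+     for page in pages:
+         entry = ET.SubElement(file_grp, ET.QName(METS, "file"),
+                               ID="image_{}".format(page))
+         ET.SubElement(entry, ET.QName(METS, "FLocat"),
+                       {"LOCTYPE": "URL",
+                        ET.QName(XLINK, "href"): "{}#page={}".format(source_url, page)})
+     return ET.ElementTree(mets)
+
+ # illustrative call with a book identifier from the data set and three chosen pages
+ build_mets("https://books.google.com/books?id=2jMfAAAAMAAJ", [12, 48, 103]).write(
+     "2jMfAAAAMAAJ.mets.xml", xml_declaration=True, encoding="utf-8")
+ </code>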
The following example presents an excerpt from one of the + METS files:</p> + <figure> + <graphic xml:id="ocr_2021_001" url=".../medien/ocr_2021_001.png"> + <desc> + <ref type="graphic" target="#abb1">Fig. 1</ref>: Excerpt of a METS file as used in our data set. For + each book, we created one METS file. The link to the resource contains the + identifier and the page number. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref + type="graphic" target="#ocr_2021_001"/> + </desc> + </graphic> + </figure> + <p>The PDFs have been downloaded from the + URLs in this METS file, and the page images have been extracted from the PDF, + deskewed and saved as PNG files.<note type="footnote"> The process is implemented in the pdfs.py submodule + <ref target="https://github.com/millawell/ocr-data/blob/master/utils/pdfs.py#L23">pdfs.py:23</ref> and it uses the + command line tools imagemagick and pdfimages, see <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </p> + </div> + <div type="subchapter"> + <head>3.2.2 Transcription</head> + + <p>For transcription, the standard layout + analyzer of Kraken 2.0.8 (depending on the layout either with black or white column + separators) has been used and the transcription was pre-filled with either the + German Fraktur or the English off-the-shelf model and post-corrected manually. To + ensure consistency, some characters were normalized: for example, we encountered + multiple hyphenation characters such as <hi rend="bold">-</hi> and + <hi rend="bold">⸗</hi> which were both transcribed by <hi rend="bold">-</hi>.</p> + </div> + <div type="subchapter"> + <head>3.2.3 Size</head> + <p>In total, the data set contains 5,354 + lines with 224,745 characters. It consists of German and English books from 1815 to + 1852. A detailed description of the characteristics of the data set is shown in <ref type="graphic" target="#tab02">Table 2</ref>.</p> + </div> + </div> + <div type="subchapter"> + <head>3.3 Reproducibility and Accessibility</head> + <p>The data set formula has been + published as a collection of PAGE files and METS files.<note type="footnote"> See + <ref type="bibliography" target="#pletschacher_page_2010">Pletschacher + / Antonacopoulos 2010</ref>.</note> The PAGE files contain the transcriptions + on line-level and the METS files serve as the container linking metadata, PDF + sources and the transcriptions. There exists one METS file per item (corresponding + to a Google Books or Internet Archive id) and one PAGE file per PDF page. The + following excerpt of an example PAGE file shows how to encode one line of text:</p> + <figure> + <graphic xml:id="ocr_2021_002" url=".../medien/ocr_2021_002.png"> + <desc> + <ref type="graphic" target="#abb2">Fig. 2</ref>: Excerpt from the PAGE file showing the bounding box of + the line on the page image and the corresponding text string. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref + type="graphic" target="#ocr_2021_002"/> + </desc> + </graphic> + </figure> + <p>The <code><TextLine></code> contains the absolute pixel coordinates where the text is + located on the preprocessed PNG image and the <code><TextEquiv></code> holds the transcription of the line.</p> + <p>As shown above, the METS files contain + links to the PDFs. Additionally, the METS files contain links to the PAGE files as + shown in the following excerpt. 
</p>
+ <figure>
+ <graphic xml:id="ocr_2021_003" url=".../medien/ocr_2021_003.png">
+ <desc>
+ <ref type="graphic" target="#abb3">Fig. 3</ref>: Excerpt from the METS file as used in our data set. For
+ each book, we created one METS file. This part of the METS file contains the
+ references to the PAGE files. [<ref type="bibliography" target="#lassner_data_2021">Lassner
+ et al. 2021</ref>]<ref
+ type="graphic" target="#ocr_2021_003"/>
+ </desc>
+ </graphic>
+ </figure>
+ <p>As one can see, there are links from
+ one METS file, namely the one encoding Walter Scott’s works, Volume 2, published
+ by the Schumann brothers in 1831 in Zwickau, identified by the Google Books id <code>2jMfAAAAMAAJ</code>, to multiple pages (and PAGE files).</p>
+ <p>Finally, the METS file contains the
+ relationship between the URLs and the PAGE files in the <code><mets:structMap></code> section of the file:</p>
+ <figure>
+ <graphic xml:id="ocr_2021_004" url=".../medien/ocr_2021_004.png">
+ <desc>
+ <ref type="graphic" target="#abb4">Fig. 4</ref>: Excerpt from the METS file as used in our data set. For
+ each book, we created one METS file. Together with the links to the image resources
+ shown in <ref type="graphic" target="#ocr_2021_001">Figure 1</ref>, and the links to the PAGE
+ files, the METS file holds the connection between the text lines and the page
+ images. [<ref type="bibliography" target="#lassner_data_2021">Lassner
+ et al. 2021</ref>]<ref
+ type="graphic" target="#ocr_2021_004"/>
+ </desc>
+ </graphic>
+ </figure>
+ <p>In order to reuse the data set, a
+ scholar may then obtain the original image resources from the respective
+ institutions as PDFs, based on the links we provide in the METS files. Then, the
+ pair data set can be created by running the ›make pair_output‹ command in the
+ ›pipelines/‹ directory. For each title, it extracts the PNG images from the PDF,
+ preprocesses them, and extracts, crops and saves the line images alongside the
+ corresponding files containing the text of each line.</p>
+ <p>Although the image data needs to be
+ downloaded manually, the data set can still be compiled within minutes. </p>
+
+ </div>
+ </div>
+
+ <div type="chapter">
+ <head>4. Framework for
+ creating, publishing and reusing OCR ground truth data</head>
+ <p>We have published the framework we
+ developed for the second case study, which enables scholars to create and share
+ their own ground truth data set formulas when they are in the same situation of not
+ owning the copyright for the images they use. This framework offers both directions
+ of functionality: </p>
+ <list type="unordered">
+ <item>Creating an XML ground truth data set from transcriptions to share it with the
+ public (data set formula) and </item>
+ <item>Compiling an XML ground truth data set into standard OCR ground truth data
+ pairs to train an OCR model (built data set).<note type="footnote"> The documentation on how to create a new or reproduce an
+ existing data set can be found at <ref target="https://github.com/millawell/ocr-data/blob/master/README.md">README.md</ref>,
+ <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note>
+ </item>
+ </list>
+ <p>As already described in
+ <ref type="intern" target="#hd5">Sections 3.2</ref> and <ref type="intern" target="#hd9">3.3</ref>, there are multiple
+ steps involved in the creation, publication and
+ reuse of the OCR data set. 
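+ The core of the reuse step, cropping the line images out of the page images and pairing each
+ with its transcription, can be sketched as follows; this is a simplified, hypothetical variant
+ of what extract_pair_dataset.py does, and the PAGE namespace URI as well as the .gt.txt naming
+ convention are assumptions that have to match the files actually used: <code>
+ from pathlib import Path
+ import xml.etree.ElementTree as ET
+ from PIL import Image  # Pillow
+
+ # must match the namespace declared in the PAGE files
+ PAGE = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}"
+
+ def extract_pairs(page_xml, page_png, out_dir):
+     """Write one image/text pair per TextLine of a transcribed page."""
+     out = Path(out_dir)
+     out.mkdir(parents=True, exist_ok=True)
+     page_image = Image.open(page_png)
+     root = ET.parse(page_xml).getroot()
+     for i, line in enumerate(root.iter(PAGE + "TextLine")):
+         # bounding box of the line from the polygon points in the PAGE file
+         points = line.find(PAGE + "Coords").get("points")
+         xs, ys = zip(*(map(int, p.split(",")) for p in points.split()))
+         text = line.find(PAGE + "TextEquiv/" + PAGE + "Unicode").text or ""
+         stem = out / "{}_{:03d}".format(Path(page_xml).stem, i)
+         page_image.crop((min(xs), min(ys), max(xs), max(ys))).save(str(stem) + ".png")
+         Path(str(stem) + ".gt.txt").write_text(text, encoding="utf-8")
+ </code> A call such as extract_pairs('page_0001.xml', 'page_0001.png', 'pairs/') then yields
+ the image and text files that OCR training tools expect as input.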
In this Section, we would like to show that our work is + not only relevant for scholars who want to reuse our data set but also for scholars + who would like to publish a novel OCR ground truth data set in a similar copyright + setting. </p> + <div type="subchapter"> + <head>4.1 Creation and + Publication</head> + <list type="ordered"> + <item>Corpus construction: selection of the relevant books and pages</item> + <item>Creation of the METS files<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/blob/master/data/mets_page_template.xml">mets_page_template.xml</ref>, + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </item> + <item>Transcription of the pages</item> + <item>Creation of the PAGE files<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/blob/master/pipelines/create_xml_files.py"> + create_xml_files.py</ref>, <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </item> + <item>Publication of the METS and the PAGE files</item> + </list> + </div> + <div type="subchapter"> + <head>4.2 Reuse</head> + <list type="ordered"> + <item>Download of the METS and PAGE files</item> + <item>Download of the PDFs as found in the METS files</item> + <item>Creation of the pair data set<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/blob/master/pipelines/extract_pair_dataset.py">extract_pair_dataset.py</ref>, + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </item> + <item>Training of the OCR models<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/blob/master/pipelines/train_ocr_model.py"> + train_ocr_model.py</ref>, <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </item> + </list> + <p>In the <ref type="intern" target="#hd9">Section 3.3</ref>, the steps listed in Reuse have been + described. The download of the transcriptions and the PDFs has to be done manually + but for the creation of the pair data set and the training of the models, automation + is provided with our framework. We would like to also automatize the download of the + PDFs; this, however, remains complicated to implement. The first reason for this is + a technical one: soon after starting the download, captchas appear (as early as by + the 3rd image), which hinders the automatization. + Another reason is the Google Books regulation itself. Page one of any Google Books + PDF states explicitly: </p> + <p> + <quote type="grosszitat">Keine automatisierten Abfragen. + Senden Sie keine automatisierten Abfragen irgendwelcher Art an das + Google-System. Wenn Sie Recherchen über maschinelle Übersetzung, optische + Zeichenerkennung oder andere Bereiche durchführen, in denen der Zugang zu Text + in großen Mengen nützlich ist, wenden Sie sich bitte an uns. Wir fördern die + Nutzung des öffentlich zugänglichen Materials für diese Zwecke und können Ihnen + unter Umständen helfen.</quote><note type="footnote">When downloading any book + PDF from Google Books one page is prepended to the document. On this page, + the cited usage statement is presented. 
As an example, please consider + <ref target="https://books.googleusercontent.com/books/content?req=AKW5QacqJ1ytah-8JsyWYKfgLVnZGMYKbDlV_xg2ynjx_ + aaepDsn3n6q0CnzACs-ZyfZHd6O2QajiTZGiS8jng4nnH5kyY3xFjFOMbcRxaq1KF15JPVAQl-6en4LlMhGvzXe13qX2haJnRTvVGDAUa4W9_ + JG8toPUCCfVbqL8TF-GshZr4L9EgHZ6W4g2xUGqbRJjAs0ImImKkWhSDTUi-8jGATaViIV5xgVreVUKA4lgwFYxhpesnqlPwpOIDkJW8w3m0ztj49FPsVRDx8aepxC39l-b1Apuw">Walter Scott's Werke</ref>, + see <ref type="bibliography" target="#google_informationen_2006">Google Inc. 2006</ref>.</note> + </p> + <p>Finding a way to automatize download + could hence not be realized in the context of this project and will have to be + addressed in future work.<note type="footnote"> Our progress on this topic will be documented in issue 2 of our + <ref target="https://github.com/millawell/ocr-data/issues/2">github repository</ref>, see + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>. </note> + </p> + <p>Additionally, we provide useful + templates and automation for the creation of a novel OCR ground truth data set. As + already described, we used the Kraken transcription interface to create the + transcription. In Kraken, the final version of the transcription is stored in HTML + files. We provide a script to convert the HTML transcriptions into PAGE files in + order to facilitate interoperability with other OCR ground truth data sets.</p> + <p>Finally, the pair data set can be + created from the PAGE transcriptions and the images of the PDFs and the OCR model + can be trained.</p> + </div> + </div> + <div type="chapter"> + <head>5. Relevance of the + data set</head> + <p>In order to evaluate the impact that + the data set has on the accuracy of OCR models, we trained and tested model + performance in three different settings. In the first setting, we fine-tuned an + individual model for each book in our corpus using a training and an evaluation set + of that book and tested the performance of the model on a held-out test set from the + same book. In <ref type="graphic" target="#tab02">Table 2</ref>, we show how this data + set has dramatically improved the OCR accuracy on similar documents compared to + off-the-shelf OCR solutions. Especially in cases where the off-the-shelf model + (baseline) shows a weak performance, the performance gained by fine-tuning is + large.</p> + <p>In the second and third setting, we + split the data set into two groups: English Antiqua, German Fraktur. There was also + one German Antiqua book that we did not put into any of the two groups. For the + second setting, we split all data within a group randomly into train set, evaluation + set and test set and trained and tested an individual model for each group. In <ref type="graphic" target="#tab03">Table 3</ref>, the test performance of this setting + is shown. For both groups, the fine-tuning improves the character accuracy by a + large margin over the baseline accuracy. 
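+ Character accuracy denotes the proportion of correctly recognized characters, i. e. one minus
+ the character error rate; independent of the concrete evaluation tooling, the measure can be
+ stated as a short reference implementation (a sketch for illustration, not the code used to
+ produce the tables): <code>
+ def character_accuracy(prediction, ground_truth):
+     """Character accuracy in percent: 100 * (1 - edit distance / ground truth length)."""
+     previous = list(range(len(ground_truth) + 1))
+     for i in range(1, len(prediction) + 1):
+         current = [i] + [0] * len(ground_truth)
+         for j in range(1, len(ground_truth) + 1):
+             cost = 0 if prediction[i - 1] == ground_truth[j - 1] else 1
+             current[j] = min(previous[j] + 1,         # deletion
+                              current[j - 1] + 1,      # insertion
+                              previous[j - 1] + cost)  # substitution
+         previous = current
+     return 100.0 * (1.0 - previous[len(ground_truth)] / max(len(ground_truth), 1))
+
+ # e.g. character_accuracy("Waverley", "Waverley;") is roughly 88.9
+ </code>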
This experiment shows that overall, the + fine-tuning within a group improves the performance of that group and that patterns + are learned across individual books.</p> + <table> + + <row> + <cell>Google Books or + Internet Archive identifier</cell> + <cell>baseline + model</cell> + <cell>Train # + lines</cell> + <cell>Test # + lines</cell> + <cell>Train # + chars</cell> + <cell>Test # + chars</cell> + <cell>baseline + character accuracy</cell> + <cell>fine-tuned + character accuracy</cell> + <cell>δ</cell> + </row> + + + <row> + <cell>rDUJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>82</cell> + <cell>11</cell> + <cell>3520</cell> + <cell>493</cell> + <cell>99.8</cell> + <cell>100.0</cell> + <cell>0.2</cell> + </row> + + + <row> + <cell>chroniclesofcano02scot</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>836</cell> + <cell>97</cell> + <cell>100.0</cell> + <cell>100.0</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>anneofgeierstein03scot</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>805</cell> + <cell>138</cell> + <cell>100.0</cell> + <cell>100.0</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>_QgOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>60</cell> + <cell>8</cell> + <cell>2659</cell> + <cell>359</cell> + <cell>95.54</cell> + <cell>100.0</cell> + <cell>4.46</cell> + </row> + + + <row> + <cell>chroniclesofcano03scot</cell> + <cell>en_best</cell> + <cell>40</cell> + <cell>5</cell> + <cell>1766</cell> + <cell>185</cell> + <cell>99.46</cell> + <cell>99.46</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>zviTtwEACAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>66</cell> + <cell>9</cell> + <cell>3396</cell> + <cell>519</cell> + <cell>98.27</cell> + <cell>99.23</cell> + <cell>0.96</cell> + </row> + + + <row> + <cell>quentindurward02scotuoft</cell> + <cell>en_best</cell> + <cell>39</cell> + <cell>5</cell> + <cell>1748</cell> + <cell>241</cell> + <cell>99.17</cell> + <cell>99.17</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>3pVMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>92</cell> + <cell>12</cell> + <cell>4830</cell> + <cell>598</cell> + <cell>96.49</cell> + <cell>99.16</cell> + <cell>2.67</cell> + </row> + + + <row> + <cell>2jMfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>157</cell> + <cell>20</cell> + <cell>7386</cell> + <cell>939</cell> + <cell>93.5</cell> + <cell>98.94</cell> + <cell>5.44</cell> + </row> + + + <row> + <cell>t88yAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>84</cell> + <cell>11</cell> + <cell>3345</cell> + <cell>436</cell> + <cell>94.5</cell> + <cell>98.85</cell> + <cell>4.35</cell> + </row> + + + <row> + <cell>HCRMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>125</cell> + <cell>16</cell> + <cell>5100</cell> + <cell>579</cell> + <cell>92.23</cell> + <cell>98.79</cell> + <cell>6.56</cell> + </row> + + + <row> + <cell>zDTMtgEACAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>76</cell> + <cell>10</cell> + <cell>4277</cell> + <cell>560</cell> + <cell>93.93</cell> + <cell>98.75</cell> + <cell>4.82</cell> + </row> + + + <row> + <cell>DNUwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>76</cell> + <cell>10</cell> + <cell>4147</cell> + <cell>517</cell> + <cell>94.58</cell> + <cell>98.45</cell> + <cell>3.87</cell> + </row> + + + <row> + <cell>H9UwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>76</cell> + <cell>10</cell> + <cell>4017</cell> + <cell>533</cell> + <cell>97.19</cell> + <cell>98.31</cell> + <cell>1.12</cell> + </row> + + + <row> + 
<cell>AdiKyqdlp4cC</cell> + <cell>fraktur_1_best</cell> + <cell>77</cell> + <cell>10</cell> + <cell>2827</cell> + <cell>405</cell> + <cell>92.84</cell> + <cell>98.27</cell> + <cell>5.43</cell> + </row> + + + <row> + <cell>J4knAAAAMAAJ</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>851</cell> + <cell>104</cell> + <cell>97.12</cell> + <cell>98.08</cell> + <cell>0.96</cell> + </row> + + + <row> + <cell>aNQwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>52</cell> + <cell>7</cell> + <cell>2752</cell> + <cell>309</cell> + <cell>95.79</cell> + <cell>98.06</cell> + <cell>2.27</cell> + </row> + + + <row> + <cell>XtEyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>86</cell> + <cell>11</cell> + <cell>3489</cell> + <cell>383</cell> + <cell>94.52</cell> + <cell>97.91</cell> + <cell>3.39</cell> + </row> + + + <row> + <cell>D5pMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>88</cell> + <cell>12</cell> + <cell>4557</cell> + <cell>546</cell> + <cell>93.22</cell> + <cell>97.8</cell> + <cell>4.58</cell> + </row> + + + <row> + <cell>8AQoAAAAYAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>71</cell> + <cell>9</cell> + <cell>3130</cell> + <cell>434</cell> + <cell>94.93</cell> + <cell>97.7</cell> + <cell>2.77</cell> + </row> + + + <row> + <cell>Fy4JAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>743</cell> + <cell>125</cell> + <cell>96.0</cell> + <cell>97.6</cell> + <cell>1.6</cell> + </row> + + + <row> + <cell>anneofgeierstein02scot</cell> + <cell>en_best</cell> + <cell>42</cell> + <cell>6</cell> + <cell>1747</cell> + <cell>204</cell> + <cell>98.04</cell> + <cell>97.55</cell> + <cell>-0.49</cell> + </row> + + + <row> + <cell>u4cnAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>76</cell> + <cell>10</cell> + <cell>3936</cell> + <cell>553</cell> + <cell>91.5</cell> + <cell>97.11</cell> + <cell>5.61</cell> + </row> + + + <row> + <cell>1VUJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>85</cell> + <cell>11</cell> + <cell>3899</cell> + <cell>455</cell> + <cell>94.73</cell> + <cell>96.7</cell> + <cell>1.97</cell> + </row> + + + <row> + <cell>quentindurward01scotuoft</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>708</cell> + <cell>86</cell> + <cell>95.35</cell> + <cell>95.35</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>4zQfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>159</cell> + <cell>20</cell> + <cell>6817</cell> + <cell>932</cell> + <cell>87.98</cell> + <cell>94.74</cell> + <cell>6.76</cell> + </row> + + + <row> + <cell>7JVMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>89</cell> + <cell>12</cell> + <cell>4604</cell> + <cell>616</cell> + <cell>65.91</cell> + <cell>94.32</cell> + <cell>28.41</cell> + </row> + + + <row> + <cell>YAZXAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>1752</cell> + <cell>219</cell> + <cell>66253</cell> + <cell>8327</cell> + <cell>80.17</cell> + <cell>93.61</cell> + <cell>13.44</cell> + </row> + + + <row> + <cell>8dAyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>88</cell> + <cell>12</cell> + <cell>3448</cell> + <cell>380</cell> + <cell>87.11</cell> + <cell>93.42</cell> + <cell>6.31</cell> + </row> + + + <row> + <cell>PzMJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>61</cell> + <cell>8</cell> + <cell>2294</cell> + <cell>234</cell> + <cell>90.17</cell> + <cell>92.74</cell> + <cell>2.57</cell> + </row> + + + <row> + <cell>wggOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>19</cell> + <cell>3</cell> + <cell>716</cell> + <cell>94</cell> + 
<cell>91.49</cell> + <cell>92.55</cell> + <cell>1.06</cell> + </row> + + + <row> + <cell>WjMfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>183</cell> + <cell>23</cell> + <cell>7363</cell> + <cell>814</cell> + <cell>71.62</cell> + <cell>91.52</cell> + <cell>19.9</cell> + </row> + + + <row> + <cell>MzQJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>36</cell> + <cell>5</cell> + <cell>1265</cell> + <cell>201</cell> + <cell>88.56</cell> + <cell>90.55</cell> + <cell>1.99</cell> + </row> + + + <row> + <cell>fAoOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>40</cell> + <cell>6</cell> + <cell>1675</cell> + <cell>121</cell> + <cell>86.78</cell> + <cell>87.6</cell> + <cell>0.82</cell> + </row> + + + <row> + <cell>kggOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>40</cell> + <cell>6</cell> + <cell>1572</cell> + <cell>243</cell> + <cell>82.72</cell> + <cell>82.72</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>oNEyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>73</cell> + <cell>10</cell> + <cell>2874</cell> + <cell>386</cell> + <cell>68.39</cell> + <cell>79.02</cell> + <cell>10.63</cell> + </row> + + + <row> + <cell>htQwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>78</cell> + <cell>10</cell> + <cell>3990</cell> + <cell>464</cell> + <cell>69.18</cell> + <cell>78.02</cell> + <cell>8.84</cell> + </row> + <trailer xml:id="tab02"> + <ref type="intern" target="#tab2">Tab. 2</ref>: Performance comparison of baseline model and fine-tuned + model for each document in our corpus. For almost all documents there is a large + improvement over the baseline even with a very limited number of fine-tuning + samples. The sum of lines and characters depicted in the table do not add up to the + numbers reported in the text because during training we used an additional split of + the data as an evaluation set that had the same size as the test set respectively. + [Lassner et al. 2021]<ref type="graphic" target="#ocr_2021_t2"/> + </trailer> + </table> + + + <table> + + <row> + <cell>Document + Group</cell> + <cell>baseline + model</cell> + <cell>Train # + lines</cell> + <cell>Test # + lines</cell> + <cell>Train # + chars</cell> + <cell>Test # + chars</cell> + <cell>baseline + character accuracy</cell> + <cell>fine-tuned + character accuracy</cell> + <cell>δ</cell> + </row> + + + <row> + <cell>English Antiqua</cell> + <cell>en_best</cell> + <cell>650</cell> + <cell>82</cell> + <cell>26793</cell> + <cell>3406</cell> + <cell>94.19</cell> + <cell>96.21</cell> + <cell>2.02</cell> + </row> + + + <row> + <cell>German Fraktur</cell> + <cell>fraktur_1_best</cell> + <cell>3449</cell> + <cell>432</cell> + <cell>145928</cell> + <cell>17577</cell> + <cell>85.89</cell> + <cell>95.99</cell> + <cell>10.1</cell> + </row> + <trailer xml:id="tab03"> + <ref type="intern" target="#tab3">Tab. 3</ref>: Performance comparison of baseline model and fine-tuned + model trained on a random splits of samples within the same group. [Lassner et al. 
2021]<ref type="graphic" target="#ocr_2021_t3"/> + </trailer> + </table> + + <table> + + <row> + <cell>Left-out + identifier</cell> + <cell>baseline + model</cell> + <cell>Train # + lines</cell> + <cell>Test # + lines</cell> + <cell>Train # + chars</cell> + <cell>Test # + chars</cell> + <cell>baseline + character accuracy</cell> + <cell>fine-tuned + character accuracy</cell> + <cell>δ</cell> + </row> + + + <row> + <cell>chroniclesofcano03scot</cell> + <cell>en_best</cell> + <cell>686</cell> + <cell>50</cell> + <cell>28134</cell> + <cell>2182</cell> + <cell>99.22</cell> + <cell>99.59</cell> + <cell>0.37</cell> + </row> + + + <row> + <cell>H9UwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3794</cell> + <cell>96</cell> + <cell>159088</cell> + <cell>5130</cell> + <cell>96.74</cell> + <cell>99.57</cell> + <cell>2.83</cell> + </row> + + + <row> + <cell>aNQwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3822</cell> + <cell>65</cell> + <cell>161053</cell> + <cell>3397</cell> + <cell>97.0</cell> + <cell>99.53</cell> + <cell>2.53</cell> + </row> + + + <row> + <cell>chroniclesofcano02scot</cell> + <cell>en_best</cell> + <cell>709</cell> + <cell>25</cell> + <cell>29226</cell> + <cell>1017</cell> + <cell>99.02</cell> + <cell>99.51</cell> + <cell>0.49</cell> + </row> + + + <row> + <cell>zDTMtgEACAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3794</cell> + <cell>96</cell> + <cell>159131</cell> + <cell>5430</cell> + <cell>95.05</cell> + <cell>99.43</cell> + <cell>4.38</cell> + </row> + + + <row> + <cell>anneofgeierstein03scot</cell> + <cell>en_best</cell> + <cell>708</cell> + <cell>26</cell> + <cell>29144</cell> + <cell>1062</cell> + <cell>98.68</cell> + <cell>99.34</cell> + <cell>0.66</cell> + </row> + + + <row> + <cell>t88yAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3786</cell> + <cell>105</cell> + <cell>160286</cell> + <cell>4181</cell> + <cell>91.13</cell> + <cell>99.28</cell> + <cell>8.15</cell> + </row> + + + <row> + <cell>anneofgeierstein02scot</cell> + <cell>en_best</cell> + <cell>684</cell> + <cell>53</cell> + <cell>28053</cell> + <cell>2181</cell> + <cell>98.3</cell> + <cell>99.27</cell> + <cell>0.97</cell> + </row> + + + <row> + <cell>DNUwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3794</cell> + <cell>96</cell> + <cell>159113</cell> + <cell>5228</cell> + <cell>95.26</cell> + <cell>99.01</cell> + <cell>3.75</cell> + </row> + + + <row> + <cell>D5pMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3780</cell> + <cell>111</cell> + <cell>159386</cell> + <cell>5660</cell> + <cell>93.69</cell> + <cell>99.01</cell> + <cell>5.32</cell> + </row> + + + <row> + <cell>3pVMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3777</cell> + <cell>115</cell> + <cell>158561</cell> + <cell>6036</cell> + <cell>94.68</cell> + <cell>98.99</cell> + <cell>4.31</cell> + </row> + + + <row> + <cell>zviTtwEACAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3806</cell> + <cell>83</cell> + <cell>159741</cell> + <cell>4384</cell> + <cell>95.76</cell> + <cell>98.97</cell> + <cell>3.21</cell> + </row> + + + <row> + <cell>8AQoAAAAYAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3800</cell> + <cell>89</cell> + <cell>160966</cell> + <cell>3926</cell> + <cell>94.7</cell> + <cell>98.9</cell> + <cell>4.2</cell> + </row> + + + <row> + <cell>1VUJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>635</cell> + <cell>107</cell> + <cell>25735</cell> + <cell>4839</cell> + <cell>96.88</cell> + <cell>98.8</cell> + <cell>1.92</cell> + </row> + + + <row> + <cell>AdiKyqdlp4cC</cell> + 
<cell>fraktur_1_best</cell> + <cell>3793</cell> + <cell>97</cell> + <cell>160065</cell> + <cell>3736</cell> + <cell>92.34</cell> + <cell>98.47</cell> + <cell>6.13</cell> + </row> + + + <row> + <cell>rDUJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>639</cell> + <cell>103</cell> + <cell>26265</cell> + <cell>4419</cell> + <cell>97.85</cell> + <cell>98.42</cell> + <cell>0.57</cell> + </row> + + + <row> + <cell>quentindurward02scotuoft</cell> + <cell>en_best</cell> + <cell>687</cell> + <cell>49</cell> + <cell>28274</cell> + <cell>2223</cell> + <cell>97.35</cell> + <cell>98.34</cell> + <cell>0.99</cell> + </row> + + + <row> + <cell>HCRMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3739</cell> + <cell>157</cell> + <cell>158250</cell> + <cell>6378</cell> + <cell>91.28</cell> + <cell>98.28</cell> + <cell>7.0</cell> + </row> + + + <row> + <cell>J4knAAAAMAAJ</cell> + <cell>en_best</cell> + <cell>708</cell> + <cell>26</cell> + <cell>29219</cell> + <cell>1089</cell> + <cell>97.15</cell> + <cell>98.07</cell> + <cell>0.92</cell> + </row> + + + <row> + <cell>2jMfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3703</cell> + <cell>197</cell> + <cell>155342</cell> + <cell>9181</cell> + <cell>92.43</cell> + <cell>98.04</cell> + <cell>5.61</cell> + </row> + + + <row> + <cell>XtEyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3783</cell> + <cell>108</cell> + <cell>160349</cell> + <cell>4322</cell> + <cell>87.69</cell> + <cell>97.59</cell> + <cell>9.9</cell> + </row> + + + <row> + <cell>quentindurward01scotuoft</cell> + <cell>en_best</cell> + <cell>708</cell> + <cell>26</cell> + <cell>29284</cell> + <cell>940</cell> + <cell>96.38</cell> + <cell>97.13</cell> + <cell>0.75</cell> + </row> + + + <row> + <cell>wggOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>710</cell> + <cell>24</cell> + <cell>29362</cell> + <cell>869</cell> + <cell>92.52</cell> + <cell>96.89</cell> + <cell>4.37</cell> + </row> + + + <row> + <cell>_QgOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>664</cell> + <cell>75</cell> + <cell>27117</cell> + <cell>3320</cell> + <cell>94.43</cell> + <cell>96.66</cell> + <cell>2.23</cell> + </row> + + + <row> + <cell>fAoOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>685</cell> + <cell>51</cell> + <cell>28128</cell> + <cell>2007</cell> + <cell>94.72</cell> + <cell>96.61</cell> + <cell>1.89</cell> + </row> + + + <row> + <cell>4zQfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3701</cell> + <cell>199</cell> + <cell>156399</cell> + <cell>8681</cell> + <cell>88.68</cell> + <cell>96.37</cell> + <cell>7.69</cell> + </row> + + + <row> + <cell>PzMJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>662</cell> + <cell>77</cell> + <cell>27724</cell> + <cell>2817</cell> + <cell>90.7</cell> + <cell>95.49</cell> + <cell>4.79</cell> + </row> + + + <row> + <cell>u4cnAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3795</cell> + <cell>95</cell> + <cell>159827</cell> + <cell>4889</cell> + <cell>91.31</cell> + <cell>95.21</cell> + <cell>3.9</cell> + </row> + + + <row> + <cell>7JVMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3780</cell> + <cell>112</cell> + <cell>159080</cell> + <cell>5816</cell> + <cell>71.35</cell> + <cell>94.62</cell> + <cell>23.27</cell> + </row> + + + <row> + <cell>8dAyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3780</cell> + <cell>111</cell> + <cell>159841</cell> + <cell>4271</cell> + <cell>84.45</cell> + <cell>94.24</cell> + <cell>9.79</cell> + </row> + + + <row> + <cell>htQwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3792</cell> + 
<cell>98</cell> + <cell>158623</cell> + <cell>4996</cell> + <cell>88.42</cell> + <cell>94.14</cell> + <cell>5.72</cell> + </row> + + + <row> + <cell>YAZXAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>1909</cell> + <cell>2190</cell> + <cell>89328</cell> + <cell>82910</cell> + <cell>80.68</cell> + <cell>92.92</cell> + <cell>12.24</cell> + </row> + + + <row> + <cell>MzQJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>691</cell> + <cell>45</cell> + <cell>28714</cell> + <cell>1622</cell> + <cell>84.9</cell> + <cell>89.52</cell> + <cell>4.62</cell> + </row> + + + <row> + <cell>kggOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>685</cell> + <cell>51</cell> + <cell>28216</cell> + <cell>1983</cell> + <cell>85.64</cell> + <cell>87.56</cell> + <cell>1.92</cell> + </row> + + + <row> + <cell>Fy4JAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>709</cell> + <cell>25</cell> + <cell>29424</cell> + <cell>943</cell> + <cell>78.9</cell> + <cell>85.15</cell> + <cell>6.25</cell> + </row> + + + <row> + <cell>oNEyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3798</cell> + <cell>92</cell> + <cell>160955</cell> + <cell>3589</cell> + <cell>66.31</cell> + <cell>84.79</cell> + <cell>18.48</cell> + </row> + <trailer xml:id="tab04"> + <ref type="intern" target="#tab4">Tab. 4</ref>: Model performance evaluated with a leave-one-out + strategy. Within each group (German Fraktur and English Antiqua), an individual + model is trained on all samples except those from the left-out identifier, on which the + model is then tested. The fine-tuned model improves over the baseline in + each case, often by a large margin. [Lassner et al. 2021]<ref type="graphic" target="#ocr_2021_t4"/> + </trailer> + </table> + <p>In the third setting, we trained + multiple models within each group, always training on all books of that group except + one and using only the data of the left-out book for testing. In all settings, we + also report the performance of the off-the-shelf OCR model on the test set for + comparison.</p> + <p>As depicted in <ref type="graphic" target="#tab04">Table 4</ref>, fine-tuning + improves character accuracy in every case, even for the held-out book. This shows that + the fine-tuned model did not overfit to a specific book but captured patterns + of a specific script. We should note that in some cases of the third experiment, + different volumes of the same work occur as individual samples; for example, the second volume of + Anne of Geierstein by Scott was not held out when testing on the third volume of + Anne of Geierstein. Scripts in different volumes of the same work are often more similar than scripts + that merely share the same font type, which might improve the outcome of this experiment in some + cases.</p> + <p>For all three experiments, the Kraken + OCR engine with a German Fraktur model and an English model served as the baselines. + Both models were provided by the maintainers of Kraken.<note type="footnote"> See + <ref type="bibliography" target="#kiessling_kraken_2019">Kiessling + 2019</ref>. For the baselines and for fine-tuning, version 3.0.4 of the Kraken engine was + used, which can be found at <ref target="https://github.com/mittagessen/kraken/releases/tag/3.0.4">kraken release 3.0.4</ref>, + <ref type="bibliography" target="#kraken_git_2021">Kiessling + 2021</ref>.</note> + </p>
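+                  <p>Editorial illustration, not part of the original article: the character
+                     accuracy reported in the tables above is commonly computed as one minus
+                     the edit distance between the predicted text and the ground truth,
+                     divided by the number of ground truth characters. The Python sketch below
+                     shows this metric together with a schematic leave-one-out loop over
+                     books, mirroring the third experiment; all names (edit_distance,
+                     character_accuracy, leave_one_out, fine_tune, evaluate) are hypothetical
+                     and do not refer to the published tooling.</p>
+                  <eg xml:space="preserve"><![CDATA[
+ def edit_distance(a: str, b: str) -> int:
+     """Levenshtein distance between two strings via dynamic programming."""
+     prev = list(range(len(b) + 1))
+     for i, ca in enumerate(a, start=1):
+         curr = [i]
+         for j, cb in enumerate(b, start=1):
+             curr.append(min(prev[j] + 1,                # deletion
+                             curr[j - 1] + 1,            # insertion
+                             prev[j - 1] + (ca != cb)))  # substitution
+         prev = curr
+     return prev[-1]
+
+ def character_accuracy(ground_truth: str, prediction: str) -> float:
+     """Character accuracy in percent, as reported in the tables."""
+     if not ground_truth:
+         return 100.0
+     return 100.0 * (1 - edit_distance(ground_truth, prediction) / len(ground_truth))
+
+ def leave_one_out(books):
+     """books maps a book identifier to its list of (line image, transcription)
+     pairs. For each identifier, train on all other books of the group and test
+     on the held-out book; fine_tune and evaluate are placeholders."""
+     results = {}
+     for held_out in books:
+         train = [pair for ident, pairs in books.items()
+                  if ident != held_out for pair in pairs]
+         # model = fine_tune(baseline_model, train)
+         # results[held_out] = evaluate(model, books[held_out])
+         results[held_out] = len(train)  # placeholder: number of training lines
+     return results
+ ]]></eg>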
<p>In the context of the research project + for which this data set was created, the performance gain is especially relevant, as + research shows that a certain level of OCR quality is needed to obtain meaningful + results on downstream tasks. For example, Hamdi et al. show the impact of OCR quality + on the performance of Named Entity Recognition as a + downstream task.<note type="footnote"> + See + <ref type="bibliography" target="#hamdi_impact_2020">Hamdi + et al. 2020</ref>.</note> With additional cross-training of sub-corpora, we are + confident that we will be able to push the character accuracy beyond 95% on all test + sets, which will enable us to perform translatorship attribution analysis.</p> + <p>More generally, the results show that + in a variety of settings, additional ground truth data will improve the OCR results. + This advocates strongly for the publication of a greater range of, and especially + more diverse, sets of open and reusable ground truth data for historical prints.</p> + <p>The data set we thus created and + published is open and reproducible following the described framework. It can serve + as a template for other OCR ground truth data set projects. It is therefore not only + relevant because it shows why the community should create additional data sets: it + also shows how to create them and invites new publications that are bound to + bring Digital Humanities research a step forward.</p> + <p>The data pairs are compatible with + other OCR ground truth data sets such as OCR-D<note type="footnote"> See + <ref type="bibliography" target="#baierer_ocr_2019">Baierer + et al. 2019</ref>.</note> or GT4HistOCR<note type="footnote"> See + <ref type="bibliography" target="#springmann_truth_2018">Springmann + et al. 2018</ref>.</note>. Using the established PAGE-XML standard enables + interoperability and reusability of the transcriptions. Using open licenses for the + source code and the data, and publishing releases at an institutional open data + repository ensures representativeness and durability.</p>
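+                  <p>Editorial illustration, not part of the original article: each PAGE file
+                     in the data set pairs the coordinates of a text line on the page image
+                     with its transcription. A minimal, namespace-agnostic reading sketch in
+                     Python could look as follows; the file name and function names are
+                     hypothetical, and element names follow the PAGE standard.</p>
+                  <eg xml:space="preserve"><![CDATA[
+ import xml.etree.ElementTree as ET
+
+ def local(tag):
+     """Strip the XML namespace, keeping only the local element name."""
+     return tag.rsplit('}', 1)[-1]
+
+ def read_page_lines(path):
+     """Yield (points, text) pairs for every TextLine in a PAGE-XML file:
+     the bounding polygon on the page image and the transcription string."""
+     root = ET.parse(path).getroot()
+     for elem in root.iter():
+         if local(elem.tag) != 'TextLine':
+             continue
+         points, text = None, None
+         for child in elem:  # direct children of the TextLine element
+             if local(child.tag) == 'Coords':
+                 points = child.get('points')
+             elif local(child.tag) == 'TextEquiv':
+                 for uni in child:
+                     if local(uni.tag) == 'Unicode':
+                         text = (uni.text or '').strip()
+         yield points, text
+
+ # Hypothetical usage:
+ # for points, text in read_page_lines('page_0001.xml'):
+ #     print(points, text)
+ ]]></eg>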
</div> + <div type="chapter"> + <head>6. Conclusion</head> + <p>The work we carried out to + constitute the data set needed for our stylometric research provided not only a + ground truth data set, but also a systematic approach to the legal issues we + encountered when extracting information from the scanned books we rely on as a + primary source. While we have been successful at automating many work steps, + improvements could still be envisioned.</p> + <p>In future work, we would like to + enrich the links to the original resource with additional links to mirrors of the + resources in order to increase the persistence of the image sources, also adding + OCLC IDs as universal identifiers whenever available.<note type="footnote"> OCLC provides a registry of IDs referencing items in libraries; see <ref target="https://www.worldcat.org/">worldcat.org</ref>, OCLC 2021.</note> We would also like + to look into ways to automate the download of the PDFs from Google Books, the + Internet Archive or CHIs. Also, we would like to extend the framework we proposed + here. It could serve for hybrid data sets with parts where the copyright status of the + image data is unclear (then published as a data set formula) and parts with approved + image redistribution (which could then be published as a built data set). It could + be used, for example, for the data sets from the Bayerische Staatsbibliothek and the University + of Illinois Urbana-Champaign.</p> + <p>Finally, we would like to encourage + scholars to publish their OCR ground truth data sets in a similarly open and + interoperable manner, thus making it possible to ultimately increase accessibility + to archives and libraries for everyone.</p> + </div> + <div type="chapter"> + <head>Acknowledgements</head> + <p>This work has been supported by the + German Federal Ministry for Education and Research as BIFOLD.</p> + </div> + <div type="chapter"> + <head>List of contracts</head> + <p>The contracts between</p> + <list type="unordered"> + <item>a number of US-based libraries and Google is available <ref target="https://web.archive.org/web/20120707144623/http:/thepublicindex.org/docs/libraries/cic.pdf">here</ref>,</item> + <item>the British Library and Google is available <ref target="https://www.openrightsgroup.org/app/uploads/2020/03/BL-Google-Contract.pdf">here</ref>,</item> + <item>the National Library of the Netherlands and Google is available <ref target="https://web.archive.org/web/20111025094345/http:/www.kb.nl/nieuws/2011/contract-google-kb.pdf">here</ref>, </item> + <item>the University of Michigan and Google is available <ref target="http://web.archive.org/web/20050906002322/https:/www.lib.umich.edu/mdp/um-google-cooperative-agreement.pdf">here</ref>, </item> + <item>the University of Texas at Austin and Google is available <ref target="https://web.archive.org/web/20151226021049/https:/www.lib.utexas.edu/sites/default/files/google/utexas_google_agreement.pdf">here</ref>,</item> + <item>the University of Virginia and Google is available <ref target="https://web.archive.org/web/20120707144748/http:/thepublicindex.org/docs/libraries/virginia.pdf">here</ref>,</item> + <item>Scanning Solutions (for the Bibliothèque municipale de Lyon) and Google is + available <ref target="https://web.archive.org/web/20120707144718/http:/thepublicindex.org/docs/libraries/lyon_ae.pdf">here</ref>,</item> + <item>the University of California and Google is available <ref target="https://web.archive.org/web/20120707144625/http:/thepublicindex.org/docs/libraries/california.pdf">here</ref>.</item> + </list> + </div> + </div> + <div type="bibliography"> + <head>Bibliographic references</head> + <listBibl> + <bibl xml:id="bachleiter_uebersetzungsfabriken_1989">Norbert + Bachleitner: »Übersetzungsfabriken«: das deutsche Übersetzungswesen in der ersten + Hälfte des 19. Jahrhunderts. In: Internationales Archiv für Sozialgeschichte der + deutschen Literatur 14 (1989), i. 1, pp. 1–50. <ptr type="gbv" cRef="129444383" + /></bibl> + <bibl xml:id="baillot_data_2016">Anne Baillot / Mike Mertens / Laurent Romary: Data fluidity + in DARIAH – pushing the agenda forward. In: Bibliothek Forschung und Praxis 39 + (2016), i. 3, pp. 350–357. DOI: <ref target="https://doi.org/10.1515/bfp-2016-0039">10.1515/bfp-2016-0039</ref> + <ptr type="gbv" cRef="12961193X" + /></bibl> + <bibl xml:id="baierer_ocr_2019">Konstantin Baierer / Matthias Boenig / Clemens Neudecker: + Labelling OCR Ground Truth for Usage in Repositories.
In: Proceedings of the + International Conference on Digital Access to Textual Cultural Heritage (DATeCH2019: + 3, Brussels, 08.–10.05.2019) New York, NY 2019, pp. 3–8. <ptr type="gbv" cRef="1734515961" + /></bibl> + <bibl xml:id="htr_united_2021">HTR-United. In: GitHub.io. By Alix Chagué / Thibault + Clérice. 2021. [<ref target="https://htr-united.github.io/">online</ref>]</bibl> + <bibl xml:id="eclevia_data_2019">Marian Ramos Eclevia / John Christopher La Torre Fredeluces + / Carlos Jr Lagrosas Eclevia / Roselle Saguibo Maestro: What Makes a Data Librarian? + An Analysis of Job Descriptions and Specifications for Data Librarian. In: + Qualitative and Quantitative Methods in Libraries 8 (2019), i. 3, pp. 273–290. [<ref target="http://qqml-journal.net/index.php/qqml/article/view/541">online</ref>]</bibl> + <bibl xml:id="engl_volltexte_2020">Elisabeth Engl: Volltexte für die Frühe Neuzeit. Der + Beitrag des OCR-D-Projekts zur Volltexterkennung frühneuzeitlicher Drucke. In: + Zeitschrift für Historische Forschung 47 (2020), i. 2, pp. 223–250. <ptr type="gbv" cRef="129309338" + /></bibl> + <bibl xml:id="eth_transcriptiones_2020">Transcriptiones. A platform for hosting, accessing and + sharing transcripts of non-digitised historical manuscripts. Ed. by ETH-Library. + Zürich 2020. [<ref target="https://www.librarylab.ethz.ch/project/transcriptiones/">online</ref>]</bibl> + <bibl xml:id="hamdi_impact_2020">Ahmed Hamdi / Axel Jean-Caurant / Nicolas Sidère / Mickaël + Coustaty: Assessing and Minimizing the Impact of OCR Quality on Named Entity + Recognition. In: Digital libraries for open knowledge. + International Conference on Theory and Practice of Digital Libraries. (TPDL: 24, + Lyon, 25.–27.08.2020) Cham 2020, pp. 87–101. <ptr type="gbv" cRef="173775262X" + /></bibl> + <bibl xml:id="heritage_data_2021">The Heritage Data Reuse Charter. In: DARIAH.eu. 2021. + [<ref target="https://www.dariah.eu/activities/open-science/data-re-use/">online</ref>]</bibl> + + + + <bibl xml:id="google_informationen_2006">Informationen und Richtlinien. Ed. by Google Inc. In: Google Books. Walter Scott: + Großvater's Erzählungen aus der Geschichte von Frankreich. Ed. by Georg Nicolaus Bärmann. Neue Folge. Zweiter Theil. Zwickau 1831. Digitalisiert am 15.11.2006. PDF. + [<ref target="https://books.googleusercontent.com/books/content?req=AKW5QacqJ1ytah-8JsyWYKfgLVnZGMYKbDlV_xg2ynjx_aaepDsn3n6q0CnzACs-ZyfZHd6O2QajiTZGiS8jng4nnH5kyY3xFjFOMbcRxaq1KF15JPVAQl-6en4LlMhGvzXe13qX2haJnRTvVGDAUa4W9_JG8toPUCCfVbqL8TF-GshZr4L9EgHZ6W4g2xUGqbRJjAs0ImImKkWhSDTUi-8jGATaViIV5xgVreVUKA4lgwFYxhpesnqlPwpOIDkJW8w3m0ztj49FPsVRDx8aepxC39l-b1Apuw">online</ref>] + </bibl> + + <bibl xml:id="kiessling_kraken_2019">Benjamin Kiessling: Kraken – an Universal Text Recognizer + for the Humanities. In: Digital Humanities 2019 Conference papers. (DH2019, Utrecht, + 08.–12.07.2019) Utrecht 2019. [<ref target="https://dev.clariah.nl/files/dh2019/boa/0673.html">online</ref>]</bibl> + <bibl xml:id="kraken_git_2021">Kraken 3.0.4. In: GitHub.io. Ed. by Benjamin Kiessling. + 2021. [<ref target="https://github.com/mittagessen/kraken/releases/tag/3.0.4">online</ref>]</bibl> + <bibl xml:id="lassner_data_2021">David Lassner / Julius Coburger / Clemens Neudecker / Anne + Baillot: Data set of the paper »Publishing an OCR ground truth data set for reuse in + an unclear copyright setting«. In: zenodo.org. 2021. Version 1.1 from 07.05.2021.
+ DOI: <ref target="https://doi.org/10.5281/zenodo.4742068">10.5281/zenodo.4742068</ref> + </bibl> + <bibl xml:id="mets_loc_2021">METS. Metadata Encoding & Transmission Standard. Home. + Ed. by The Library of Congress. Washington D.C. 04.10.2021. [<ref target="http://www.loc.gov/standards/mets/">online</ref>]</bibl> + <bibl xml:id="liebl_newspapers_2020">Bernhard Liebl / Manuel Burghardt: From Historical + Newspapers to Machine-Readable Data: The Origami OCR Pipeline. In: Proceedings of + the Workshop on Computational Humanities Research. Ed. by Folgert Karsdorp / Barbara + McGillivray / Adina Nerghes / Melvin Wevers. (CHR2020, Amsterdam, 18.–20.11.2020) Aachen 2020, pp. 351–373. (= CEUR Workshop Proceedings, 2723) URN: <ref target="https://nbn-resolving.org/urn:nbn:de:0074-2723-3">urn:nbn:de:0074-2723-3</ref> + </bibl> + <bibl xml:id="ocr_data_2021">OCR-Data. In: GitHub.io. 2021. [<ref target="https://github.com/millawell/ocr-data">online</ref>]</bibl> + <bibl xml:id="ocrd_ocrd_2021">OCR-D. Specifications. In: OCR-D.de. Wolfenbüttel 2021. + [<ref target="https://ocr-d.de/en/spec/">online</ref>]</bibl> + <bibl xml:id="ocr_model_2021">OCR/HTR model repository. In: Zenodo.org. 2021. + [<ref target="https://zenodo.org/communities/ocr_models/?page=1&size=20">online</ref>]</bibl> + <bibl xml:id="worldcat_oclc_2021">WorldCat. Ed. by OCLC. Dublin 2021. [<ref target="https://www.worldcat.org/">online</ref>] + </bibl> + <bibl xml:id="padilla_report_2019">Thomas Padilla / Laurie Allen / Hannah Frost / Sarah Potvin + / Elizabeth Russey Roke / Stewart Varner: Final Report – Always Already + Computational: Collections as Data. In: zenodo.org. Version 1 from 22.05.2019. + DOI: <ref target="http://doi.org/10.5281/zenodo.3152935">10.5281/zenodo.3152935</ref> + </bibl> + <bibl xml:id="pletschacher_page_2010">Stefan Pletschacher / Apostolos Antonacopoulos: The PAGE + (Page Analysis and Ground-Truth Elements) Format Framework. In: Proceedings of the + 20th International Conference on Pattern Recognition. Ed. by IEEE. (ICPR: 20, + Istanbul, 23.–26.08.2010) Piscataway, NJ 2010, vol. 1, pp. 257–260. <ptr type="gbv" cRef="639567843"/></bibl> + + <bibl xml:id="readcoop_models_2021">Public AI models in Transkribus. Ed. by READ-COOP. + Innsbruck 2021. [<ref target="https://readcoop.eu/transkribus/public-models/">online</ref>]</bibl> + + <bibl xml:id="reul_learning_2017">Christian Reul / Christoph Wick / Uwe Springmann / Frank + Puppe: Transfer Learning for OCRopus Model Training on Early Printed Books. In: + Zeitschrift für Bibliothekskultur 5 (2017), i. 1, pp. 32–45. In: zenodo.org. Version + 1 from 22.12.2017. DOI: <ref target="https://doi.org/10.5281/zenodo.4705364">10.5281/zenodo.4705364</ref> + </bibl> + <bibl xml:id="ruiz_agreement_2011">Javier Ruiz: Access to the Agreement between Google Books + and the British Library. In: Open Rights Group. Ed. by The Society of Authors. + Blogpost from 24.08.2011. + [<ref target="https://www.openrightsgroup.org/blog/access-to-the-agreement-between-google-books-and-the-british-library/">online</ref>] </bibl> + + <bibl xml:id="schoech_textformate_2020">Christof Schöch / Frédéric Döhl / Achim Rettinger / Evelyn + Gius / Peer Trilcke / Peter Leinen / Fotis Jannidis / Maria Hinzmann / Jörg Röpke: + Abgeleitete Textformate: Text und Data Mining mit urheberrechtlich geschützten + Textbeständen. In: Zeitschrift für digitale Geisteswissenschaften 5 (2020).
DOI: + <ref target="http://doi.org/10.17175/2020_006">10.17175/2020_006</ref></bibl> + + + + <bibl xml:id="springmann_truth_2018">Uwe Springmann / Christian Reul / Stefanie Dipper / + Johannes Baiter: Ground Truth for training OCR engines on historical documents in + German Fraktur and Early Modern Latin. In: The Journal for Language Technology and + Computational Linguistics 33 (2018), i. 1, pp. 97–114. PDF. [<ref target="https://jlcl.org/content/2-allissues/2-heft1-2018/jlcl_2018-1_5.pdf">online</ref>] + </bibl> + </listBibl> + </div> + <div type="abbildungsnachweis"> + <head>List of Figures with + Captions</head> + <desc type="table" xml:id="tab1"><ref target="#tab01" type="intern">Tab. 1</ref>: Responses of library + institutions to our request to grant permission to publish excerpts of the scans + for which they were contractors of the digitization. Most institutions responded + within a few working days; apart from the fact that most acknowledged the + public domain status of the items, the responses were very diverse. Many answered that + they are either not responsible or only responsible for their Library Copy of + the PDF. [Lassner et al. 2021] + <ref type="graphic" target="#ocr_2021_t1"/></desc> + <desc type="graphic" xml:id="abb1">Excerpt from a METS file as + used in our data set. For each book, we created one METS file. The link to the + resource contains the identifier and the page number. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref type="graphic" target="#ocr_2021_001" /></desc> + <desc type="graphic" xml:id="abb2">Excerpt from the PAGE + file showing the bounding box of the line on the page image and the + corresponding text string. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref type="graphic" target="#ocr_2021_002"/></desc> + <desc type="graphic" xml:id="abb3">Excerpt from the METS + file as used in our data set. For each book, we created one METS file. This part + of the METS file contains the references to the PAGE files. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref type="graphic" target="#ocr_2021_003"/></desc> + <desc type="graphic" xml:id="abb4">Excerpt from the METS + file as used in our data set. For each book, we created one METS file. Together + with the links to the image resources shown in <ref type="graphic" target="#ocr_2021_001">Figure 1</ref>, and the links to the + PAGE files, the METS file holds the connection between the text lines and the + page images. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref type="graphic" target="#ocr_2021_004"/></desc> + <desc type="table" xml:id="tab2"><ref target="#tab02" type="intern">Tab. 2</ref>: Performance comparison of + baseline model and fine-tuned model for each document in our corpus. For almost + all documents there is a large improvement over the baseline even with a very + limited number of fine-tuning samples. The sums of lines and characters depicted + in the table do not add up to the numbers reported in the text because, during + training, we used an additional split of the data as an evaluation set of + the same size as the respective test set. [Lassner et al. 2021]<ref type="graphic" target="#ocr_2021_t2"/></desc> + <desc type="table" xml:id="tab3"><ref target="#tab03" type="intern">Tab. 3</ref>: Performance comparison of + baseline model and fine-tuned model trained on random splits of samples within + the same group. [Lassner et al.
2021]<ref type="graphic" target="#ocr_2021_t3"/></desc> + <desc type="table" xml:id="tab4"><ref target="#tab04" type="intern">Tab. 4</ref>: Model performance + evaluated with a leave-one-out strategy. Within each group (German Fraktur and + English Antiqua), an individual model is trained on all samples except those from the + left-out identifier, on which the model is then tested. The fine-tuned model + improves over the baseline in each case, often by a large margin. [Lassner + et al. 2021]<ref type="graphic" target="#ocr_2021_t4"/></desc> + + </div> + </body> + </text> +</TEI>