diff --git a/SB05_007_lassner_et_al/ocr_2021_001.png b/SB05_007_lassner_et_al/ocr_2021_001.png new file mode 100644 index 0000000000000000000000000000000000000000..fc0bafcdd33f6687d678c038ecd03a89bd6b3c10 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_001.png differ diff --git a/SB05_007_lassner_et_al/ocr_2021_002.png b/SB05_007_lassner_et_al/ocr_2021_002.png new file mode 100644 index 0000000000000000000000000000000000000000..e575dce9d74dbb5f78fd0a498cb49bf44e292e47 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_002.png differ diff --git a/SB05_007_lassner_et_al/ocr_2021_003.png b/SB05_007_lassner_et_al/ocr_2021_003.png new file mode 100644 index 0000000000000000000000000000000000000000..70840efbc75433e56e0c6ab7a8ce317ee4a98fb2 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_003.png differ diff --git a/SB05_007_lassner_et_al/ocr_2021_004.png b/SB05_007_lassner_et_al/ocr_2021_004.png new file mode 100644 index 0000000000000000000000000000000000000000..71b4e4c36ca65baa6e942edf24669b816e4f7e98 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_004.png differ diff --git a/SB05_007_lassner_et_al/ocr_2021_v1_0.pdf b/SB05_007_lassner_et_al/ocr_2021_v1_0.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ffe586ddf569f34cfa052b75f18bec5d86488646 Binary files /dev/null and b/SB05_007_lassner_et_al/ocr_2021_v1_0.pdf differ diff --git a/SB05_007_lassner_et_al/ocr_2021_v1_0.xml b/SB05_007_lassner_et_al/ocr_2021_v1_0.xml new file mode 100644 index 0000000000000000000000000000000000000000..5f724f478169cb1065081a46058072952942f61e --- /dev/null +++ b/SB05_007_lassner_et_al/ocr_2021_v1_0.xml @@ -0,0 +1,2214 @@ +<?xml version="1.0" encoding="utf-8"?> +<?xml-model href="https://www.zfdg.de/sites/default/files/schema/tei_zfdg.rnc" type="application/relax-ng-compact-syntax" + ?> +<TEI xmlns="http://www.tei-c.org/ns/1.0" + xmlns:html="http://www.w3.org/1999/html" + xmlns:tei="http://www.tei-c.org/ns/1.0" + xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns:xhtml="http://www.w3.org/1999/xhtml"> + <teiHeader> + <fileDesc> + <titleStmt> + <title> + <biblStruct> + <analytic> + <title level="a">Publishing an OCR ground truth data set for reuse in an unclear + copyright setting. 
Two case studies with legal and + technical solutions to enable a collective OCR ground truth data set effort</title> + + <respStmt> + <resp> + <persName> + <name role="marc_aut"> + <forename>David</forename> + <surname>Lassner</surname> + </name> + <email>lassner@tu-berlin.de</email> + <idno type="gnd">1246941414</idno> + <idno type="orcid">0000-0001-9013-0834</idno> + </persName> + </resp> + <orgName>Technische Universität Berlin, Machine Learning Group | The Berlin Institute for the Foundations of Learning and Data (BIFOLD)</orgName> + </respStmt> + <respStmt> + <resp> + <persName> + <name role="marc_aut"> + <forename>Julius</forename> + <surname>Coburger</surname> + </name> + <email>julius.coburger@gmx.de</email> + <idno type="gnd">124694197X</idno> + <idno type="orcid">0000-0003-4502-7955</idno> + </persName> + </resp> + <orgName>Technische Universität Berlin, Machine Learning Group</orgName> + </respStmt> + <respStmt> + <resp> + <persName> + <name role="marc_aut"> + <forename>Clemens</forename> + <surname>Neudecker</surname> + </name> + <email>clemens.neudecker@sbb.spk-berlin.de</email> + <idno type="gnd">1246943069</idno> + <idno type="orcid">0000-0001-5293-8322</idno> + </persName> + </resp> + <orgName>Staatsbibliothek zu Berlin – Preußischer Kulturbesitz</orgName> + </respStmt> + <respStmt> + <resp> + <persName> + <name role="marc_aut"> + <forename>Anne</forename> + <surname>Baillot</surname> + </name> + <email>anne.baillot@univ-lemans.fr</email> + <idno type="gnd">1065904681</idno> + <idno type="orcid">0000-0002-4593-059X</idno> + </persName> + </resp> + <orgName>Le Mans Université | École normale supérieure de Lyon, Interactions, Corpus, Apprentissages, Représentations - ICAR</orgName> + </respStmt> + <idno type="doi">10.17175/sb005_006</idno> + <idno type="ppn">1780168195</idno> + <idno type="zfdg">2021.006</idno> + <idno type="url">https://www.zfdg.de/node/340</idno> + <date when="2021-09-08">10.12.2021</date> + </analytic> + <monogr> + <title level="j">Zeitschrift für digitale Geisteswissenschaften</title> + <title level="m">Sonderband: Fabrikation von Erkenntnis – Experimente in den Digital Humanities.</title> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Manuel</forename> + <surname>Burghardt</surname> + </name> + <email>burghardt@informatik.uni-leipzig.de</email> + <idno type="gnd">1237665523</idno> + <idno type="orcid">0000-0003-1354-9089</idno> + </resp> + <orgName>Universität Leipzig</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Lisa</forename> + <surname>Dieckmann</surname> + </name> + <email>lisa.dieckmann@uni-koeln.de</email> + <idno type="gnd">1077268289</idno> + <idno type="orcid">0000-0002-1708-7371</idno> + </resp> + <orgName>Universität zu Köln</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <surname>Timo</surname> + <forename>Steyer</forename> + </name> + <email>t.steyer@tu-braunschweig.de</email> + <idno type="gnd">1053806175</idno> + <idno type="orcid">0000-0003-0218-2269</idno> + </resp> + <orgName>Technische Universität Braunschweig</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Peer</forename> + <surname>Trilcke</surname> + </name> + <email>trilcke@uni-potsdam.de</email> + <idno type="gnd">139145117</idno> + <idno type="orcid">0000-0002-1421-4320</idno> + </resp> + <orgName>Universität Potsdam</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Niels-Oliver</forename> + <surname>Walkowski</surname> + </name> + 
<email>niels-oliver.walkowski@uni.lu</email> + <idno type="gnd">1023378671</idno> + <idno type="orcid">0000-0003-3043-3010</idno> + </resp> + <orgName>Universität Luxemburg</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Joëlle</forename> + <surname>Weis</surname> + </name> + <email>weis@hab.de</email> + <idno type="gnd">1233399721</idno> + <idno type="orcid">0000-0002-0080-4362</idno> + </resp> + <orgName>Forschungsverbund Marbach Weimar Wolfenbüttel</orgName> + </respStmt> + <respStmt> + <resp> + <name role="marc_edt"> + <forename>Ulrike</forename> + <surname>Wuttke</surname> + </name> + <email>wuttke@fhpotsdam.de</email> + <idno type="gnd">1107808405</idno> + <idno type="orcid">0000-0002-8217-4025</idno> + </resp> + <orgName>Fachhochschule Potsdam</orgName> + </respStmt> + <respStmt> + <resp>Publiziert von</resp> + <orgName role="marc_pbl">Herzog August Bibliothek</orgName> + </respStmt> + <respStmt> + <resp>Transformation der Word Vorlage nach TEI</resp> + <name role="marc_trc"> + <surname>Baumgarten</surname> + <forename>Marcus</forename> + <idno type="gnd">1192832655</idno> + </name> + </respStmt> + <availability status="free"> + <p>Available at <ref target="http://www.zfdg.de">https://www.zfdg.de</ref> + </p> + </availability> + <biblScope unit="sonderband">5</biblScope> + <biblScope unit="artikel">6</biblScope> + </monogr> + </biblStruct> + </title> + </titleStmt> + <editionStmt> + <edition>Elektronische Ausgabe nach TEI P5</edition> + </editionStmt> + <publicationStmt> + <distributor> + <name> + <orgName>Herzog August Bibliothek Wolfenbüttel</orgName> + </name> + </distributor> + <idno type="doi">10.17175/sb005</idno> + <idno type="ppn">1764792149</idno> + <idno type="url">https://www.zfdg.de/sonderband/5</idno> + <date when="2021-09-19">2021</date> + <authority> + <name>Herzog August Bibliothek</name> + <address> + <addrLine/> + </address> + </authority> + <authority> + <name>Forschungsverbund MWW</name> + <address> + <addrLine/> + </address> + </authority> + <availability status="free"> + <p> Sofern nicht anders angegeben </p> + <licence target="http://creativecommons.org/licenses/by/4.0/">CC BY SA + 4.0</licence> + </availability> + <availability status="free"> + <p> Available at <ref target="workID">https://www.zfdg.de"> (c) + Forschungsverbund MWW</ref> + </p> + </availability> + </publicationStmt> + <sourceDesc> + <p>Einreichung zum Call for Publications im Rahmen der vDHd21.</p> + </sourceDesc> + </fileDesc> + <encodingDesc> + <editorialDecl> + <p>Transformation der WORD-Vorlage nach XML/TEI-P5 durch die Oxgarage und eigenen + XSLT; Lektorat des Textes durch die Herausgeber*innen und die Redaktion der ZfdG.</p> + <p>Medienrechte liegen bei den Autor*innen.</p> + <p>All links checked<date when="2021">02.12.2021</date> + </p> + </editorialDecl> + </encodingDesc> + <profileDesc> + <creation>Einreichung für den Sonderband 5 der Zeitschrift für digitale + Geisteswissenschaften.</creation> + <langUsage> + <language ident="de">Text in Deutsch</language> + <language ident="de">Abstract in Deutsch</language> + <language ident="en">Abstract in Englisch</language> + </langUsage> + <textClass> + <keywords scheme="gnd"> + <term>Informatik<ref target="4026894-9"/> + </term> + <term>Maschinelles Lernen<ref target="4193754-5"/> + </term> + <term>Optische Zeichenerkennung<ref target="4310936-6"/> + </term> + <term>Urheberrecht<ref target="4062127-3"/> + </term> + </keywords> + </textClass> + </profileDesc> + <revisionDesc> + <change/> + </revisionDesc> + 
</teiHeader> + <text> + <body> + <div> + <div type="abstract"> + <argument xml:lang="de"> + <p>In dieser Arbeit stellen wir einen OCR-Trainingsdatensatz für + historische Drucke vor und zeigen, wie sich im Vergleich zu unspezifischen Modellen + die Erkennungsgenauigkeit verbessert, wenn sie mithilfe dieser Daten weitertrainiert + werden. Wir erörtern die Nachnutzbarkeit dieses Datensatzes anhand von zwei + Experimenten, die die rechtliche Grundlage zur Veröffentlichung digitalisierter + Bilddateien am Beispiel von deutschen und englischen Büchern des 19. Jahrhunderts + betrachten. Wir präsentieren ein Framework, mit dem OCR-Trainingsdatensätze + veröffentlicht werden können, auch wenn die Bilddateien nicht zur + Wiederveröffentlichung freigegeben sind.</p> + </argument> + </div> + <div type="abstract"> + <argument xml:lang="en"> + <p>We present an OCR ground truth data set for historical prints + and show improvement of recognition results over baselines with training on this + data. We reflect on reusability of the ground truth data set based on two + experiments that look into the legal basis for reuse of digitized document images in + the case of 19th century English and German books. We propose a framework for + publishing ground truth data even when digitized document images cannot be easily + redistributed. </p> + </argument> + </div> + <div type="chapter"> + <head>1. Introduction</head> + <p>Digital access to Cultural Heritage is + a key challenge for today’s society. It has been improved by <term type="dh">Optical Character Recognition</term> (OCR), which is the + task by which a computer program extracts text from a digital image in order to draw + the text from that image and present it in a machine-readable form. For historical + prints, off-the-shelf OCR solutions often result in inaccurate readings. Another + impediment to accessing digitized cultural heritage data consists in the fact that + cultural heritage institutions provide online access to massive amounts of digitized + images of historical prints that have not been (or have been poorly) OCRed. + Solutions to improve this situation would benefit a wide range of actors, be they + scholars or a general audience. Many actors would indeed profit greatly from methods + conceived to extract high quality machine-readable text from images.</p> + <p>The results of an OCR method can be + improved significantly by using a pre-trained model and fine-tuning it on only a few + samples that display similar characteristics.<note type="footnote"> See + <ref type="bibliography" target="#liebl_newspapers_2020">Liebl + / Burghardt 2020</ref>; <ref type="bibliography" target="#reul_learning_2017">Reul et al. 2017</ref>; + <ref type="bibliography" target="#springmann_truth_2018">Springmann et al. 2018</ref>.</note> To + that end, there has been a growing effort from the Digital Humanities community to + create and publish data sets for specific historical periods, languages and + typefaces aiming at enabling scholars to fine-tune OCR models for their collection + of historical documents.<note type="footnote"> See <ref type="bibliography" target="#padilla_report_2019">Padilla et al. 2019</ref>. For manuscripts, just recently the Transcriptiones platform launched, see + <ref target="https://www.librarylab.ethz.ch/project/transcriptiones/">transcriptiones</ref>, <ref type="bibliography" target="#eth_transcriptiones_2020">ETH-Library + 2020</ref>. 
For French texts from the 18th to the 21st century there exists HTR-United, see
+ <ref target="https://htr-united.github.io/">htr-united</ref>, <ref type="bibliography" target="#htr_united_2021">Chagué
+ / Clérice 2021</ref>. The slightly different approach
+ of just publishing fine-tuned models for different settings is proposed by
+ Transkribus, see <ref target="http://transkribus.eu/wiki/images/d/d6/Public_Models_in_Transkribus.pdf">Transkribus</ref>,
+ <ref type="bibliography" target="#readcoop_models_2021">READ-COOP
+ 2021</ref>, or <ref type="bibliography" target="#kraken_git_2021">Kraken 2021</ref>
+ <ref target="https://zenodo.org/communities/ocr_models/">ocr_models</ref>, <ref type="bibliography" target="#ocr_model_2021">OCR/HTR model
+ repository 2021</ref>.</note> In Germany, the DFG-funded OCR-D initiative
+ brings together major research libraries with the goal of creating an open source
+ framework for the OCR of historical printed documents, including specifications and
+ guidelines for OCR ground truths.<note type="footnote"> See
+ <ref type="bibliography" target="#engl_volltexte_2020">Engl
+ 2020</ref>.</note>
+ </p>
+ <p>In order to improve OCR results,
+ images and the corresponding transcriptions are collected in such a way that each
+ pair (image and text) only represents one line of text from the original page. This
+ is called a ground truth data set and is precisely what we will focus on in the
+ following.</p>
+ <p>Besides the fact that manually transcribing
+ images is tedious work, another major issue arises in
+ this type of collective effort: the institutions that produce the scans often
+ claim some form of copyright to them. For example, on the first page of any of their
+ PDFs, Google Books <quote>[…] request[s] that you use these files for
+ personal, non-commercial purposes</quote><note type="footnote"> <ref type="bibliography" target="#google_information_2021">Google Inc. 2021</ref>, cited after <ref type="bibliography" target="#ruiz_agreement_2011">Ruiz 2011</ref>.</note>. As a consequence, a scholar aiming to create an OCR
+ ground truth data set would not know with certainty whether the rights to
+ redistribute the textline images derived from the PDF can be considered
+ granted.</p>
+ <p>In this paper, we present an OCR
+ ground truth data set with an unclear copyright setting for the image data. We
+ discuss the legal background, show the relevance of the data set and provide an
+ in-depth analysis of its constitution and reuse by investigating two different
+ approaches to overcome the copyright issues.</p>
+ <p>In order to address these issues, we
+ compare the following two ways of publishing the OCR ground truth data set with
+ image data. </p>
+ <list type="unordered">
+ <item>As Google Books works with cultural heritage institutions (CHIs) to digitize
+ books, we asked permission from the CHIs to redistribute the image data. </item>
+ <item>We published a data set formula, which consists of the transcriptions, links
+ to the image sources, and a description of how to build the data set. For this
+ process, we provide a fast, highly automated framework that enables others to
+ reproduce the data set. </item>
+ </list>
+ </div>
+ <div type="chapter">
+ <head>2. 
Legal background + and its interpretation at CHIs</head> + <p>Clarifying the copyright situation for + the scans of a book collection requires to take into account, for each book, the + cultural heritage institution owning the book (usually a library), and, in the case + of private-public partnerships, also the scanning institution (e. g. Google Books) + involved in its digitization. For Google Books, there exist different contracts + between CHIs and Google, and not all of them are open to public inspection. However, + based on comparing the ones that are available, we assume that other contracts are + to some extent similar (see <ref type="intern" target="#hd16">List of Contracts</ref>). The + contracts contain information on the ›Library Digital Copy‹ for which non-profit + uses are defined under Section 4.8 (cf. British Library Google Contract), which + states that a </p> + <p> + <quote type="grosszitat">Library may provide all or any + portion of the Library Digital Copy, that is [...] a Digital Copy of a Public + Domain work to (a) academic institutions or research libraries, or (b) when + requested by Library and agreed upon in writing by Google, other not-for-profit + or government entities that are not providing search or hosting services + substantially similar to those provided by Google.</quote><note type="footnote"> British Library Google Books Agreement in <ref type="bibliography" target="#ruiz_agreement_2011">Ruiz + 2011.</ref></note> + </p> + <p>When trying to unpack this legal + information against the use case presented here, multiple questions arise. What are + the legal possibilities for individual scholars regarding the use of the Library + Digital Copy of a Public Domain work? How can there be limitations in the use of a + Public Domain work? Is the use case of OCR model training substantially similar to + any search or hosting services provided by Google? Would and can libraries act as + brokers in negotiating written agreements about not-for-profit use with Google?</p> + <p>In the continuation of Section 4.8, + additional details are specified with regard to data redistribution by ›Additional + institutions‹ where </p> + <p> + <quote type="grosszitat">[a written agreement with + Google] will prohibit such Additional institution from redistributing [...] + portions of the Library Digital Copy to other entities (beyond providing or + making content available to scholars and other users for educational or research + purposes.</quote><note type="footnote"> + British Library Google Books Agreement in <ref type="bibliography" target="#ruiz_agreement_2011">Ruiz 2011</ref>.</note> + </p> + <p>This brings up further questions but + also opens the perspective a bit, since there appear to be exceptions for <quote>scholars and other users for educational or research + purposes</quote><note type="footnote"> + British Library Google Books Agreement in <ref type="bibliography" target="#ruiz_agreement_2011">Ruiz 2011</ref>.</note>, which is a precise fit of the use case we + present here. Now what does this mean in practice? Digital Humanities scholars are + not necessarily legal experts, so how do libraries that have entered + public-private-partnerships with Google for digitization of Public Domain works + implement these constraints? Schöch et al. 
discuss a wide range of use cases in the + area of text and data mining with copyright protected digitized documents, but they + do not cover the creation and distribution of ground truth.<note type="footnote"> See + <ref type="bibliography" target="#schoech_textformate_2020">Schöch + et al. 2020</ref>.</note> In other scenarios that involve copyrighted texts + published in derived formats, one question typically preventing redistribution is + whether it is possible to re-create the (copyright-protected) work from the derived + parts. In the case of textline ground truth, it is however likely that this would + constitute a violation of such a principle. In this unclear setting, scholars are in + need of support and guidance by CHIs.</p> + <table> + <row> + <cell>Institution</cell> + <cell>Total # books</cell> + <cell>Total # pages</cell> + <cell>Response time (# working days)</cell> + <cell>Allowed to publish as part of the + paper</cell> + <cell>Allowed to license</cell> + <cell>Alternative source</cell> + <cell>Responsible</cell> + <cell>Citation needed</cell> + </row> + + + <row> + <cell>Bayerische Staatsbibliothek</cell> + <cell>4</cell> + <cell>12</cell> + <cell>3</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>Biblioteca Statale Isontina Gorizia</cell> + <cell>1</cell> + <cell>3</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + </row> + + + <row> + <cell>Bodleian Library</cell> + <cell>11</cell> + <cell>20</cell> + <cell>2</cell> + <cell>yes, + alternative</cell> + <cell>already CC-BY-NC</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>British Library</cell> + <cell>1</cell> + <cell>35</cell> + <cell>4</cell> + <cell>no</cell> + <cell>no</cell> + <cell>no</cell> + <cell>yes</cell> + <cell>–</cell> + </row> + + + <row> + <cell>Harvard University, Harvard College Library</cell> + <cell>1</cell> + <cell>3</cell> + <cell>0</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>no</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>New + York Public Library</cell> + <cell>5</cell> + <cell>29</cell> + <cell>3</cell> + <cell>–</cell> + <cell>–</cell> + <cell>no</cell> + <cell>no</cell> + <cell>no</cell> + </row> + + + <row> + <cell>Austrian National Library</cell> + <cell>2</cell> + <cell>6</cell> + <cell>10</cell> + <cell>yes, + alternative</cell> + <cell>no</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>Robarts – University of Toronto</cell> + <cell>2</cell> + <cell>3</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + <cell>–</cell> + </row> + + + <row> + <cell>University of Illinois Urbana-Champaign</cell> + <cell>6</cell> + <cell>4</cell> + <cell>0</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>no</cell> + <cell>yes</cell> + <cell>yes</cell> + </row> + + + <row> + <cell>University of Wisconsin – Madison</cell> + <cell>8</cell> + <cell>24</cell> + <cell>2</cell> + <cell>yes</cell> + <cell>yes</cell> + <cell>no</cell> + <cell>no</cell> + <cell>no</cell> + </row> + <trailer xml:id="tab01"> + <ref type="intern" target="#tab1">Tab. 1</ref>: Responses of library institutions to our request to + grant permission to publish excerpts of the scans for which they were contractors of + the digitization. 
Most institutions responded within a few working days and except + for the fact that most acknowledged the public domain of the items, the responses + were very diverse. Many answered that they are either not responsible or only + responsible for their Library Copy of the PDF. [Lassner et al. 2021] + <ref type="graphic" target="#ocr_2021_t1"/> + </trailer> + </table> + + <p>We have asked ten CHIs for permission + to publish image data that was digitized based on their collection in order to + publish them as part of an OCR ground truth data set under a CC-BY license. As shown + in <ref type="graphic" target="#tab01">Table 1</ref>, the institutions gave a wide + variety of responses. Many institutions acknowledged that the requested books are in + the public domain because they were published before the year 1880. However, there + is no general consensus on whether the CHIs are actually responsible for granting + these rights, especially if one wants to use the copy from the Google Books or + Internet Archive servers. Some institutions stated that they are only responsible + for their Library Copy of the scan and granted permission to publish only from that + source. Only two institutions, the Bayerische Staatsbibliothek and University of + Illinois Urbana-Champaign stated that they are responsible and that we are allowed + to also use the material that can be found on the Google Books or Internet Archive + servers. </p> + <p>This case study underlines the lack of + a clear and simple framework of reference that would be recognized and applied, and + would reflect on good practices in the relationships between CHIs and digital + scholarship. The lack of such a framework is addressed among others by the DARIAH + initiative of the Heritage Data Reuse Charter<note type="footnote"> See + <ref type="bibliography" target="#baillot_data_2016">Baillot + et al. 2016</ref>. For additional information on the DARIAH Heritage Data Reuse + Charter, see <ref target="https://www.dariah.eu/activities/open-science/data-re-use/">data-re-use</ref>, + <ref type="bibliography" target="#heritage_data_2021">DARIAH 2021</ref>.</note> that was + launched in 2017. Another approach towards such a framework is that of the ›digital + data librarian‹.<note type="footnote"> + See + <ref type="bibliography" target="#eclevia_data_2019">Eclevia + et al. 2019</ref>.</note> + </p> + </div> + <div type="chapter"> + <head>3. Description of the + data set </head> + <p>In the data set that we want to + publish in the context of our OCR ground truth, we do not own the copyright for the + image data.<note type="footnote"> The current version of the data set can be found at + <ref target="https://github.com/millawell/ocr-data/tree/master/data">ocr-data/data</ref>, + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> We therefore + distinguish between the data set formula and the built data set. We publish the data + set formula which contains the transcriptions, the links to the images and a recipe + on how to build the data set.</p> + <p>The data set formula and source code + are published on Github<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/">ocr-data</ref>, + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> and the version + 1.1 we are referring to in this paper is mirrored on the open access repository + Zenodo.<note type="footnote"> See + <ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 
2021</ref>.</note> The data set is published under a CC-BY 4.0 license + and the source code is published under an Apache license.</p> + <div type="subchapter"> + <head>3.1 Origin</head> + <p>The built data set contains images + from editions of books by Walter Scott and William Shakespeare in the original + English and in translations into German that were published around 1830. </p> + <p>The data set was created as part of a + research project that investigates how to implement stylometric methods that are + commonly used to analyze the style of authors with the goal of analyzing that of + translators. The data set was organized in such a way that other variables like + authors of the documents or publication date can be ruled out as a confounder of the + translator style. </p> + <p>We found that 1830 Germany was + especially suitable for the research setting we had in mind. Due to an increased + readership in Germany around 1830, there was a growing demand in books. Translating + foreign publications into German turned out to be particularly profitable because, + at that time, there was no copyright regulation that would apply equally across + German-speaking states. There was no general legal constraint to regulate payments + to the original authors of books or as to who was allowed to publish a German + translation of a book. Therefore, publishers were competing in translating most + recent foreign works into German, which resulted in multiple German translations by + different translators of the same book at the same time. To be the first one to + publish a translation into German, publishers resorted to what was later called + translation factories, optimized for translation speed.<note type="footnote"> See + <ref type="bibliography" target="#bachleiter_uebersetzungsfabriken_1989">Bachleitner + 1989</ref>.</note> The translators working in such ›translation factories‹ + were not specialized in the translation of one specific author. It is in fact not + rare to find books from different authors translated by the same translator.</p> + </div> + <div type="subchapter"> + <head>3.2 Method</head> + <p>We identified three translators who + all translated books from both Shakespeare and Scott, sometimes even the same books. + We also identified the English editions that were most likely to have been used by + the translators. This enabled us to set up a book-level parallel English-German + corpus allowing us to, again, rule out the confounding author signal.</p> + <p>As the constructed data set is only + available in the form of PDFs from Google Books and the Internet Archive or the + respective partner institutions, OCR was a necessary step for applying stylometric + tools on the text corpus. To assess the quality of off-the-shelf OCR methods and to + improve the OCR quality, for each book, a random set of pages was chosen for manual + transcription. </p> + <div type="subchapter"> + <head>3.2.1 Preparation</head> + <p>Following the OCR-D initiative’s + specifications and best practices,<note type="footnote"> See <ref target="https://ocr-d.de/en/spec/">ocr-d + spec</ref>, <ref type="bibliography" target="#ocrd_ocrd_2021">OCR-D + 2021</ref>.</note> for each book, we created a METS<note type="footnote"> See <ref target="http://www.loc.gov/standards/mets/">METS</ref>, + <ref type="bibliography" target="#mets_loc_2021">The Library of Congress + 2021</ref>.</note> file that contains the link to the source PDF as + well as the chosen pages. 
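+ Such a METS skeleton can be assembled with a few lines of Python; the sketch below is a
+ deliberately simplified illustration, in which the element structure, attribute set and the
+ URL pattern combining identifier and page number are assumptions made for the example rather
+ than the OCR-D-conformant structure of the published files (cf. the mets_page_template.xml
+ template in the repository): <code>
+ import xml.etree.ElementTree as ET
+
+ METS = "http://www.loc.gov/METS/"
+ XLINK = "http://www.w3.org/1999/xlink"
+ ET.register_namespace("mets", METS)
+ ET.register_namespace("xlink", XLINK)
+
+ def build_mets(source_url, pages):
+     # one file entry per selected page, pointing back to the source PDF
+     mets = ET.Element(ET.QName(METS, "mets"))
+     file_sec = ET.SubElement(mets, ET.QName(METS, "fileSec"))
+     file_grp = ET.SubElement(file_sec, ET.QName(METS, "fileGrp"), USE="IMAGE")
+     for page in pages:
+         entry = ET.SubElement(file_grp, ET.QName(METS, "file"),
+                               ID="image_{}".format(page))
+         ET.SubElement(entry, ET.QName(METS, "FLocat"),
+                       {"LOCTYPE": "URL",
+                        ET.QName(XLINK, "href"): "{}#page={}".format(source_url, page)})
+     return ET.ElementTree(mets)
+
+ # illustrative call with a book identifier from the data set and three chosen pages
+ build_mets("https://books.google.com/books?id=2jMfAAAAMAAJ", [12, 48, 103]).write(
+     "2jMfAAAAMAAJ.mets.xml", xml_declaration=True, encoding="utf-8")
+ </code>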
The following example presents an excerpt from one of the + METS files:</p> + <figure> + <graphic xml:id="ocr_2021_001" url=".../medien/ocr_2021_001.png"> + <desc> + <ref type="graphic" target="#abb1">Fig. 1</ref>: Excerpt of a METS file as used in our data set. For + each book, we created one METS file. The link to the resource contains the + identifier and the page number. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref + type="graphic" target="#ocr_2021_001"/> + </desc> + </graphic> + </figure> + <p>The PDFs have been downloaded from the + URLs in this METS file, and the page images have been extracted from the PDF, + deskewed and saved as PNG files.<note type="footnote"> The process is implemented in the pdfs.py submodule + <ref target="https://github.com/millawell/ocr-data/blob/master/utils/pdfs.py#L23">pdfs.py:23</ref> and it uses the + command line tools imagemagick and pdfimages, see <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </p> + </div> + <div type="subchapter"> + <head>3.2.2 Transcription</head> + + <p>For transcription, the standard layout + analyzer of Kraken 2.0.8 (depending on the layout either with black or white column + separators) has been used and the transcription was pre-filled with either the + German Fraktur or the English off-the-shelf model and post-corrected manually. To + ensure consistency, some characters were normalized: for example, we encountered + multiple hyphenation characters such as <hi rend="bold">-</hi> and + <hi rend="bold">⸗</hi> which were both transcribed by <hi rend="bold">-</hi>.</p> + </div> + <div type="subchapter"> + <head>3.2.3 Size</head> + <p>In total, the data set contains 5,354 + lines with 224,745 characters. It consists of German and English books from 1815 to + 1852. A detailed description of the characteristics of the data set is shown in <ref type="graphic" target="#tab02">Table 2</ref>.</p> + </div> + </div> + <div type="subchapter"> + <head>3.3 Reproducibility and Accessibility</head> + <p>The data set formula has been + published as a collection of PAGE files and METS files.<note type="footnote"> See + <ref type="bibliography" target="#pletschacher_page_2010">Pletschacher + / Antonacopoulos 2010</ref>.</note> The PAGE files contain the transcriptions + on line-level and the METS files serve as the container linking metadata, PDF + sources and the transcriptions. There exists one METS file per item (corresponding + to a Google Books or Internet Archive id) and one PAGE file per PDF page. The + following excerpt of an example PAGE file shows how to encode one line of text:</p> + <figure> + <graphic xml:id="ocr_2021_002" url=".../medien/ocr_2021_002.png"> + <desc> + <ref type="graphic" target="#abb2">Fig. 2</ref>: Excerpt from the PAGE file showing the bounding box of + the line on the page image and the corresponding text string. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref + type="graphic" target="#ocr_2021_002"/> + </desc> + </graphic> + </figure> + <p>The <code><TextLine></code> contains the absolute pixel coordinates where the text is + located on the preprocessed PNG image and the <code><TextEquiv></code> holds the transcription of the line.</p> + <p>As shown above, the METS files contain + links to the PDFs. Additionally, the METS files contain links to the PAGE files as + shown in the following excerpt. 
</p>
+ <figure>
+ <graphic xml:id="ocr_2021_003" url=".../medien/ocr_2021_003.png">
+ <desc>
+ <ref type="graphic" target="#abb3">Fig. 3</ref>: Excerpt from the METS file as used in our data set. For
+ each book, we created one METS file. This part of the METS file contains the
+ references to the PAGE files. [<ref type="bibliography" target="#lassner_data_2021">Lassner
+ et al. 2021</ref>]<ref
+ type="graphic" target="#ocr_2021_003"/>
+ </desc>
+ </graphic>
+ </figure>
+ <p>As one can see, there are links from
+ one METS file, namely the one encoding Walter Scott’s works, Volume 2, published
+ by the Schumann brothers in 1831 in Zwickau, identified by the Google Books id <code>2jMfAAAAMAAJ</code>, to multiple pages (and PAGE files).</p>
+ <p>Finally, the METS file contains the
+ relationship between the URLs and the PAGE files in the <code><mets:structMap></code> section of the file:</p>
+ <figure>
+ <graphic xml:id="ocr_2021_004" url=".../medien/ocr_2021_004.png">
+ <desc>
+ <ref type="graphic" target="#abb4">Fig. 4</ref>: Excerpt from the METS file as used in our data set. For
+ each book, we created one METS file. Together with the links to the image resources
+ shown in <ref type="graphic" target="#ocr_2021_001">Figure 1</ref>, and the links to the PAGE
+ files, the METS file holds the connection between the text lines and the page
+ images. [<ref type="bibliography" target="#lassner_data_2021">Lassner
+ et al. 2021</ref>]<ref
+ type="graphic" target="#ocr_2021_004"/>
+ </desc>
+ </graphic>
+ </figure>
+ <p>In order to reuse the data set, a
+ scholar may then obtain the original image resources from the respective
+ institutions as PDFs, based on the links we provide in the METS files. Then, the
+ pair data set can be created by running the ›make pair_output‹ command in the
+ ›pipelines/‹ directory. For each title, it extracts the PNG images from the PDF,
+ preprocesses them, and extracts, crops and saves the line images alongside the
+ corresponding files containing the text of each line.</p>
+ <p>Although the image data needs to be
+ downloaded manually, the data set can still be compiled within minutes. </p>
+
+ </div>
+ </div>
+
+ <div type="chapter">
+ <head>4. Framework for
+ creating, publishing and reusing OCR ground truth data</head>
+ <p>We have published the framework we
+ developed for the second case study, which enables scholars to create and share
+ their own ground truth data set formulas when they are in the same situation of not
+ owning the copyright for the images they use. This framework offers both directions
+ of functionality: </p>
+ <list type="unordered">
+ <item>Creating an XML ground truth data set from transcriptions to share it with the
+ public (data set formula) and </item>
+ <item>Compiling an XML ground truth data set into standard OCR ground truth data
+ pairs to train an OCR model (built data set).<note type="footnote"> The documentation on how to create a new or reproduce an
+ existing data set can be found at <ref target="https://github.com/millawell/ocr-data/blob/master/README.md">README.md</ref>,
+ <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note>
+ </item>
+ </list>
+ <p>As already described in
+ <ref type="intern" target="#hd5">Sections 3.2</ref> and <ref type="intern" target="#hd9">3.3</ref>, there are multiple
+ steps involved in the creation, publication and
+ reuse of the OCR data set. 
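+ The core of the reuse step, cropping the line images out of the page images and pairing each
+ with its transcription, can be sketched as follows; this is a simplified, hypothetical variant
+ of what extract_pair_dataset.py does, and the PAGE namespace URI as well as the .gt.txt naming
+ convention are assumptions that have to match the files actually used: <code>
+ from pathlib import Path
+ import xml.etree.ElementTree as ET
+ from PIL import Image  # Pillow
+
+ # must match the namespace declared in the PAGE files
+ PAGE = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}"
+
+ def extract_pairs(page_xml, page_png, out_dir):
+     """Write one image/text pair per TextLine of a transcribed page."""
+     out = Path(out_dir)
+     out.mkdir(parents=True, exist_ok=True)
+     page_image = Image.open(page_png)
+     root = ET.parse(page_xml).getroot()
+     for i, line in enumerate(root.iter(PAGE + "TextLine")):
+         # bounding box of the line from the polygon points in the PAGE file
+         points = line.find(PAGE + "Coords").get("points")
+         xs, ys = zip(*(map(int, p.split(",")) for p in points.split()))
+         text = line.find(PAGE + "TextEquiv/" + PAGE + "Unicode").text or ""
+         stem = out / "{}_{:03d}".format(Path(page_xml).stem, i)
+         page_image.crop((min(xs), min(ys), max(xs), max(ys))).save(str(stem) + ".png")
+         Path(str(stem) + ".gt.txt").write_text(text, encoding="utf-8")
+ </code> A call such as extract_pairs('page_0001.xml', 'page_0001.png', 'pairs/') then yields
+ the image and text files that OCR training tools expect as input.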
In this Section, we would like to show that our work is + not only relevant for scholars who want to reuse our data set but also for scholars + who would like to publish a novel OCR ground truth data set in a similar copyright + setting. </p> + <div type="subchapter"> + <head>4.1 Creation and + Publication</head> + <list type="ordered"> + <item>Corpus construction: selection of the relevant books and pages</item> + <item>Creation of the METS files<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/blob/master/data/mets_page_template.xml">mets_page_template.xml</ref>, + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </item> + <item>Transcription of the pages</item> + <item>Creation of the PAGE files<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/blob/master/pipelines/create_xml_files.py"> + create_xml_files.py</ref>, <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </item> + <item>Publication of the METS and the PAGE files</item> + </list> + </div> + <div type="subchapter"> + <head>4.2 Reuse</head> + <list type="ordered"> + <item>Download of the METS and PAGE files</item> + <item>Download of the PDFs as found in the METS files</item> + <item>Creation of the pair data set<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/blob/master/pipelines/extract_pair_dataset.py">extract_pair_dataset.py</ref>, + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </item> + <item>Training of the OCR models<note type="footnote"> See <ref target="https://github.com/millawell/ocr-data/blob/master/pipelines/train_ocr_model.py"> + train_ocr_model.py</ref>, <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>.</note> + </item> + </list> + <p>In the <ref type="intern" target="#hd9">Section 3.3</ref>, the steps listed in Reuse have been + described. The download of the transcriptions and the PDFs has to be done manually + but for the creation of the pair data set and the training of the models, automation + is provided with our framework. We would like to also automatize the download of the + PDFs; this, however, remains complicated to implement. The first reason for this is + a technical one: soon after starting the download, captchas appear (as early as by + the 3rd image), which hinders the automatization. + Another reason is the Google Books regulation itself. Page one of any Google Books + PDF states explicitly: </p> + <p> + <quote type="grosszitat">Keine automatisierten Abfragen. + Senden Sie keine automatisierten Abfragen irgendwelcher Art an das + Google-System. Wenn Sie Recherchen über maschinelle Übersetzung, optische + Zeichenerkennung oder andere Bereiche durchführen, in denen der Zugang zu Text + in großen Mengen nützlich ist, wenden Sie sich bitte an uns. Wir fördern die + Nutzung des öffentlich zugänglichen Materials für diese Zwecke und können Ihnen + unter Umständen helfen.</quote><note type="footnote">When downloading any book + PDF from Google Books one page is prepended to the document. On this page, + the cited usage statement is presented. 
As an example, please consider + <ref target="https://books.googleusercontent.com/books/content?req=AKW5QacqJ1ytah-8JsyWYKfgLVnZGMYKbDlV_xg2ynjx_ + aaepDsn3n6q0CnzACs-ZyfZHd6O2QajiTZGiS8jng4nnH5kyY3xFjFOMbcRxaq1KF15JPVAQl-6en4LlMhGvzXe13qX2haJnRTvVGDAUa4W9_ + JG8toPUCCfVbqL8TF-GshZr4L9EgHZ6W4g2xUGqbRJjAs0ImImKkWhSDTUi-8jGATaViIV5xgVreVUKA4lgwFYxhpesnqlPwpOIDkJW8w3m0ztj49FPsVRDx8aepxC39l-b1Apuw">Walter Scott's Werke</ref>, + see <ref type="bibliography" target="#google_informationen_2006">Google Inc. 2006</ref>.</note> + </p> + <p>Finding a way to automatize download + could hence not be realized in the context of this project and will have to be + addressed in future work.<note type="footnote"> Our progress on this topic will be documented in issue 2 of our + <ref target="https://github.com/millawell/ocr-data/issues/2">github repository</ref>, see + <ref type="bibliography" target="#ocr_data_2021">OCR-Data 2021</ref>. </note> + </p> + <p>Additionally, we provide useful + templates and automation for the creation of a novel OCR ground truth data set. As + already described, we used the Kraken transcription interface to create the + transcription. In Kraken, the final version of the transcription is stored in HTML + files. We provide a script to convert the HTML transcriptions into PAGE files in + order to facilitate interoperability with other OCR ground truth data sets.</p> + <p>Finally, the pair data set can be + created from the PAGE transcriptions and the images of the PDFs and the OCR model + can be trained.</p> + </div> + </div> + <div type="chapter"> + <head>5. Relevance of the + data set</head> + <p>In order to evaluate the impact that + the data set has on the accuracy of OCR models, we trained and tested model + performance in three different settings. In the first setting, we fine-tuned an + individual model for each book in our corpus using a training and an evaluation set + of that book and tested the performance of the model on a held-out test set from the + same book. In <ref type="graphic" target="#tab02">Table 2</ref>, we show how this data + set has dramatically improved the OCR accuracy on similar documents compared to + off-the-shelf OCR solutions. Especially in cases where the off-the-shelf model + (baseline) shows a weak performance, the performance gained by fine-tuning is + large.</p> + <p>In the second and third setting, we + split the data set into two groups: English Antiqua, German Fraktur. There was also + one German Antiqua book that we did not put into any of the two groups. For the + second setting, we split all data within a group randomly into train set, evaluation + set and test set and trained and tested an individual model for each group. In <ref type="graphic" target="#tab03">Table 3</ref>, the test performance of this setting + is shown. For both groups, the fine-tuning improves the character accuracy by a + large margin over the baseline accuracy. 
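+ Character accuracy denotes the proportion of correctly recognized characters, i. e. one minus
+ the character error rate; independent of the concrete evaluation tooling, the measure can be
+ stated as a short reference implementation (a sketch for illustration, not the code used to
+ produce the tables): <code>
+ def character_accuracy(prediction, ground_truth):
+     """Character accuracy in percent: 100 * (1 - edit distance / ground truth length)."""
+     previous = list(range(len(ground_truth) + 1))
+     for i in range(1, len(prediction) + 1):
+         current = [i] + [0] * len(ground_truth)
+         for j in range(1, len(ground_truth) + 1):
+             cost = 0 if prediction[i - 1] == ground_truth[j - 1] else 1
+             current[j] = min(previous[j] + 1,         # deletion
+                              current[j - 1] + 1,      # insertion
+                              previous[j - 1] + cost)  # substitution
+         previous = current
+     return 100.0 * (1.0 - previous[len(ground_truth)] / max(len(ground_truth), 1))
+
+ # e.g. character_accuracy("Waverley", "Waverley;") is roughly 88.9
+ </code>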
This experiment shows that overall, the + fine-tuning within a group improves the performance of that group and that patterns + are learned across individual books.</p> + <table> + + <row> + <cell>Google Books or + Internet Archive identifier</cell> + <cell>baseline + model</cell> + <cell>Train # + lines</cell> + <cell>Test # + lines</cell> + <cell>Train # + chars</cell> + <cell>Test # + chars</cell> + <cell>baseline + character accuracy</cell> + <cell>fine-tuned + character accuracy</cell> + <cell>δ</cell> + </row> + + + <row> + <cell>rDUJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>82</cell> + <cell>11</cell> + <cell>3520</cell> + <cell>493</cell> + <cell>99.8</cell> + <cell>100.0</cell> + <cell>0.2</cell> + </row> + + + <row> + <cell>chroniclesofcano02scot</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>836</cell> + <cell>97</cell> + <cell>100.0</cell> + <cell>100.0</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>anneofgeierstein03scot</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>805</cell> + <cell>138</cell> + <cell>100.0</cell> + <cell>100.0</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>_QgOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>60</cell> + <cell>8</cell> + <cell>2659</cell> + <cell>359</cell> + <cell>95.54</cell> + <cell>100.0</cell> + <cell>4.46</cell> + </row> + + + <row> + <cell>chroniclesofcano03scot</cell> + <cell>en_best</cell> + <cell>40</cell> + <cell>5</cell> + <cell>1766</cell> + <cell>185</cell> + <cell>99.46</cell> + <cell>99.46</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>zviTtwEACAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>66</cell> + <cell>9</cell> + <cell>3396</cell> + <cell>519</cell> + <cell>98.27</cell> + <cell>99.23</cell> + <cell>0.96</cell> + </row> + + + <row> + <cell>quentindurward02scotuoft</cell> + <cell>en_best</cell> + <cell>39</cell> + <cell>5</cell> + <cell>1748</cell> + <cell>241</cell> + <cell>99.17</cell> + <cell>99.17</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>3pVMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>92</cell> + <cell>12</cell> + <cell>4830</cell> + <cell>598</cell> + <cell>96.49</cell> + <cell>99.16</cell> + <cell>2.67</cell> + </row> + + + <row> + <cell>2jMfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>157</cell> + <cell>20</cell> + <cell>7386</cell> + <cell>939</cell> + <cell>93.5</cell> + <cell>98.94</cell> + <cell>5.44</cell> + </row> + + + <row> + <cell>t88yAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>84</cell> + <cell>11</cell> + <cell>3345</cell> + <cell>436</cell> + <cell>94.5</cell> + <cell>98.85</cell> + <cell>4.35</cell> + </row> + + + <row> + <cell>HCRMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>125</cell> + <cell>16</cell> + <cell>5100</cell> + <cell>579</cell> + <cell>92.23</cell> + <cell>98.79</cell> + <cell>6.56</cell> + </row> + + + <row> + <cell>zDTMtgEACAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>76</cell> + <cell>10</cell> + <cell>4277</cell> + <cell>560</cell> + <cell>93.93</cell> + <cell>98.75</cell> + <cell>4.82</cell> + </row> + + + <row> + <cell>DNUwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>76</cell> + <cell>10</cell> + <cell>4147</cell> + <cell>517</cell> + <cell>94.58</cell> + <cell>98.45</cell> + <cell>3.87</cell> + </row> + + + <row> + <cell>H9UwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>76</cell> + <cell>10</cell> + <cell>4017</cell> + <cell>533</cell> + <cell>97.19</cell> + <cell>98.31</cell> + <cell>1.12</cell> + </row> + + + <row> + 
<cell>AdiKyqdlp4cC</cell> + <cell>fraktur_1_best</cell> + <cell>77</cell> + <cell>10</cell> + <cell>2827</cell> + <cell>405</cell> + <cell>92.84</cell> + <cell>98.27</cell> + <cell>5.43</cell> + </row> + + + <row> + <cell>J4knAAAAMAAJ</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>851</cell> + <cell>104</cell> + <cell>97.12</cell> + <cell>98.08</cell> + <cell>0.96</cell> + </row> + + + <row> + <cell>aNQwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>52</cell> + <cell>7</cell> + <cell>2752</cell> + <cell>309</cell> + <cell>95.79</cell> + <cell>98.06</cell> + <cell>2.27</cell> + </row> + + + <row> + <cell>XtEyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>86</cell> + <cell>11</cell> + <cell>3489</cell> + <cell>383</cell> + <cell>94.52</cell> + <cell>97.91</cell> + <cell>3.39</cell> + </row> + + + <row> + <cell>D5pMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>88</cell> + <cell>12</cell> + <cell>4557</cell> + <cell>546</cell> + <cell>93.22</cell> + <cell>97.8</cell> + <cell>4.58</cell> + </row> + + + <row> + <cell>8AQoAAAAYAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>71</cell> + <cell>9</cell> + <cell>3130</cell> + <cell>434</cell> + <cell>94.93</cell> + <cell>97.7</cell> + <cell>2.77</cell> + </row> + + + <row> + <cell>Fy4JAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>743</cell> + <cell>125</cell> + <cell>96.0</cell> + <cell>97.6</cell> + <cell>1.6</cell> + </row> + + + <row> + <cell>anneofgeierstein02scot</cell> + <cell>en_best</cell> + <cell>42</cell> + <cell>6</cell> + <cell>1747</cell> + <cell>204</cell> + <cell>98.04</cell> + <cell>97.55</cell> + <cell>-0.49</cell> + </row> + + + <row> + <cell>u4cnAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>76</cell> + <cell>10</cell> + <cell>3936</cell> + <cell>553</cell> + <cell>91.5</cell> + <cell>97.11</cell> + <cell>5.61</cell> + </row> + + + <row> + <cell>1VUJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>85</cell> + <cell>11</cell> + <cell>3899</cell> + <cell>455</cell> + <cell>94.73</cell> + <cell>96.7</cell> + <cell>1.97</cell> + </row> + + + <row> + <cell>quentindurward01scotuoft</cell> + <cell>en_best</cell> + <cell>20</cell> + <cell>3</cell> + <cell>708</cell> + <cell>86</cell> + <cell>95.35</cell> + <cell>95.35</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>4zQfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>159</cell> + <cell>20</cell> + <cell>6817</cell> + <cell>932</cell> + <cell>87.98</cell> + <cell>94.74</cell> + <cell>6.76</cell> + </row> + + + <row> + <cell>7JVMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>89</cell> + <cell>12</cell> + <cell>4604</cell> + <cell>616</cell> + <cell>65.91</cell> + <cell>94.32</cell> + <cell>28.41</cell> + </row> + + + <row> + <cell>YAZXAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>1752</cell> + <cell>219</cell> + <cell>66253</cell> + <cell>8327</cell> + <cell>80.17</cell> + <cell>93.61</cell> + <cell>13.44</cell> + </row> + + + <row> + <cell>8dAyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>88</cell> + <cell>12</cell> + <cell>3448</cell> + <cell>380</cell> + <cell>87.11</cell> + <cell>93.42</cell> + <cell>6.31</cell> + </row> + + + <row> + <cell>PzMJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>61</cell> + <cell>8</cell> + <cell>2294</cell> + <cell>234</cell> + <cell>90.17</cell> + <cell>92.74</cell> + <cell>2.57</cell> + </row> + + + <row> + <cell>wggOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>19</cell> + <cell>3</cell> + <cell>716</cell> + <cell>94</cell> + 
<cell>91.49</cell> + <cell>92.55</cell> + <cell>1.06</cell> + </row> + + + <row> + <cell>WjMfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>183</cell> + <cell>23</cell> + <cell>7363</cell> + <cell>814</cell> + <cell>71.62</cell> + <cell>91.52</cell> + <cell>19.9</cell> + </row> + + + <row> + <cell>MzQJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>36</cell> + <cell>5</cell> + <cell>1265</cell> + <cell>201</cell> + <cell>88.56</cell> + <cell>90.55</cell> + <cell>1.99</cell> + </row> + + + <row> + <cell>fAoOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>40</cell> + <cell>6</cell> + <cell>1675</cell> + <cell>121</cell> + <cell>86.78</cell> + <cell>87.6</cell> + <cell>0.82</cell> + </row> + + + <row> + <cell>kggOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>40</cell> + <cell>6</cell> + <cell>1572</cell> + <cell>243</cell> + <cell>82.72</cell> + <cell>82.72</cell> + <cell>0.0</cell> + </row> + + + <row> + <cell>oNEyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>73</cell> + <cell>10</cell> + <cell>2874</cell> + <cell>386</cell> + <cell>68.39</cell> + <cell>79.02</cell> + <cell>10.63</cell> + </row> + + + <row> + <cell>htQwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>78</cell> + <cell>10</cell> + <cell>3990</cell> + <cell>464</cell> + <cell>69.18</cell> + <cell>78.02</cell> + <cell>8.84</cell> + </row> + <trailer xml:id="tab02"> + <ref type="intern" target="#tab2">Tab. 2</ref>: Performance comparison of baseline model and fine-tuned + model for each document in our corpus. For almost all documents there is a large + improvement over the baseline even with a very limited number of fine-tuning + samples. The sum of lines and characters depicted in the table do not add up to the + numbers reported in the text because during training we used an additional split of + the data as an evaluation set that had the same size as the test set respectively. + [Lassner et al. 2021]<ref type="graphic" target="#ocr_2021_t2"/> + </trailer> + </table> + + + <table> + + <row> + <cell>Document + Group</cell> + <cell>baseline + model</cell> + <cell>Train # + lines</cell> + <cell>Test # + lines</cell> + <cell>Train # + chars</cell> + <cell>Test # + chars</cell> + <cell>baseline + character accuracy</cell> + <cell>fine-tuned + character accuracy</cell> + <cell>δ</cell> + </row> + + + <row> + <cell>English Antiqua</cell> + <cell>en_best</cell> + <cell>650</cell> + <cell>82</cell> + <cell>26793</cell> + <cell>3406</cell> + <cell>94.19</cell> + <cell>96.21</cell> + <cell>2.02</cell> + </row> + + + <row> + <cell>German Fraktur</cell> + <cell>fraktur_1_best</cell> + <cell>3449</cell> + <cell>432</cell> + <cell>145928</cell> + <cell>17577</cell> + <cell>85.89</cell> + <cell>95.99</cell> + <cell>10.1</cell> + </row> + <trailer xml:id="tab03"> + <ref type="intern" target="#tab3">Tab. 3</ref>: Performance comparison of baseline model and fine-tuned + model trained on a random splits of samples within the same group. [Lassner et al. 
2021]<ref type="graphic" target="#ocr_2021_t3"/> + </trailer> + </table> + + <table> + + <row> + <cell>Left-out + identifier</cell> + <cell>baseline + model</cell> + <cell>Train # + lines</cell> + <cell>Test # + lines</cell> + <cell>Train # + chars</cell> + <cell>Test # + chars</cell> + <cell>baseline + character accuracy</cell> + <cell>fine-tuned + character accuracy</cell> + <cell>δ</cell> + </row> + + + <row> + <cell>chroniclesofcano03scot</cell> + <cell>en_best</cell> + <cell>686</cell> + <cell>50</cell> + <cell>28134</cell> + <cell>2182</cell> + <cell>99.22</cell> + <cell>99.59</cell> + <cell>0.37</cell> + </row> + + + <row> + <cell>H9UwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3794</cell> + <cell>96</cell> + <cell>159088</cell> + <cell>5130</cell> + <cell>96.74</cell> + <cell>99.57</cell> + <cell>2.83</cell> + </row> + + + <row> + <cell>aNQwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3822</cell> + <cell>65</cell> + <cell>161053</cell> + <cell>3397</cell> + <cell>97.0</cell> + <cell>99.53</cell> + <cell>2.53</cell> + </row> + + + <row> + <cell>chroniclesofcano02scot</cell> + <cell>en_best</cell> + <cell>709</cell> + <cell>25</cell> + <cell>29226</cell> + <cell>1017</cell> + <cell>99.02</cell> + <cell>99.51</cell> + <cell>0.49</cell> + </row> + + + <row> + <cell>zDTMtgEACAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3794</cell> + <cell>96</cell> + <cell>159131</cell> + <cell>5430</cell> + <cell>95.05</cell> + <cell>99.43</cell> + <cell>4.38</cell> + </row> + + + <row> + <cell>anneofgeierstein03scot</cell> + <cell>en_best</cell> + <cell>708</cell> + <cell>26</cell> + <cell>29144</cell> + <cell>1062</cell> + <cell>98.68</cell> + <cell>99.34</cell> + <cell>0.66</cell> + </row> + + + <row> + <cell>t88yAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3786</cell> + <cell>105</cell> + <cell>160286</cell> + <cell>4181</cell> + <cell>91.13</cell> + <cell>99.28</cell> + <cell>8.15</cell> + </row> + + + <row> + <cell>anneofgeierstein02scot</cell> + <cell>en_best</cell> + <cell>684</cell> + <cell>53</cell> + <cell>28053</cell> + <cell>2181</cell> + <cell>98.3</cell> + <cell>99.27</cell> + <cell>0.97</cell> + </row> + + + <row> + <cell>DNUwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3794</cell> + <cell>96</cell> + <cell>159113</cell> + <cell>5228</cell> + <cell>95.26</cell> + <cell>99.01</cell> + <cell>3.75</cell> + </row> + + + <row> + <cell>D5pMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3780</cell> + <cell>111</cell> + <cell>159386</cell> + <cell>5660</cell> + <cell>93.69</cell> + <cell>99.01</cell> + <cell>5.32</cell> + </row> + + + <row> + <cell>3pVMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3777</cell> + <cell>115</cell> + <cell>158561</cell> + <cell>6036</cell> + <cell>94.68</cell> + <cell>98.99</cell> + <cell>4.31</cell> + </row> + + + <row> + <cell>zviTtwEACAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3806</cell> + <cell>83</cell> + <cell>159741</cell> + <cell>4384</cell> + <cell>95.76</cell> + <cell>98.97</cell> + <cell>3.21</cell> + </row> + + + <row> + <cell>8AQoAAAAYAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3800</cell> + <cell>89</cell> + <cell>160966</cell> + <cell>3926</cell> + <cell>94.7</cell> + <cell>98.9</cell> + <cell>4.2</cell> + </row> + + + <row> + <cell>1VUJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>635</cell> + <cell>107</cell> + <cell>25735</cell> + <cell>4839</cell> + <cell>96.88</cell> + <cell>98.8</cell> + <cell>1.92</cell> + </row> + + + <row> + <cell>AdiKyqdlp4cC</cell> + 
<cell>fraktur_1_best</cell> + <cell>3793</cell> + <cell>97</cell> + <cell>160065</cell> + <cell>3736</cell> + <cell>92.34</cell> + <cell>98.47</cell> + <cell>6.13</cell> + </row> + + + <row> + <cell>rDUJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>639</cell> + <cell>103</cell> + <cell>26265</cell> + <cell>4419</cell> + <cell>97.85</cell> + <cell>98.42</cell> + <cell>0.57</cell> + </row> + + + <row> + <cell>quentindurward02scotuoft</cell> + <cell>en_best</cell> + <cell>687</cell> + <cell>49</cell> + <cell>28274</cell> + <cell>2223</cell> + <cell>97.35</cell> + <cell>98.34</cell> + <cell>0.99</cell> + </row> + + + <row> + <cell>HCRMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3739</cell> + <cell>157</cell> + <cell>158250</cell> + <cell>6378</cell> + <cell>91.28</cell> + <cell>98.28</cell> + <cell>7.0</cell> + </row> + + + <row> + <cell>J4knAAAAMAAJ</cell> + <cell>en_best</cell> + <cell>708</cell> + <cell>26</cell> + <cell>29219</cell> + <cell>1089</cell> + <cell>97.15</cell> + <cell>98.07</cell> + <cell>0.92</cell> + </row> + + + <row> + <cell>2jMfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3703</cell> + <cell>197</cell> + <cell>155342</cell> + <cell>9181</cell> + <cell>92.43</cell> + <cell>98.04</cell> + <cell>5.61</cell> + </row> + + + <row> + <cell>XtEyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3783</cell> + <cell>108</cell> + <cell>160349</cell> + <cell>4322</cell> + <cell>87.69</cell> + <cell>97.59</cell> + <cell>9.9</cell> + </row> + + + <row> + <cell>quentindurward01scotuoft</cell> + <cell>en_best</cell> + <cell>708</cell> + <cell>26</cell> + <cell>29284</cell> + <cell>940</cell> + <cell>96.38</cell> + <cell>97.13</cell> + <cell>0.75</cell> + </row> + + + <row> + <cell>wggOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>710</cell> + <cell>24</cell> + <cell>29362</cell> + <cell>869</cell> + <cell>92.52</cell> + <cell>96.89</cell> + <cell>4.37</cell> + </row> + + + <row> + <cell>_QgOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>664</cell> + <cell>75</cell> + <cell>27117</cell> + <cell>3320</cell> + <cell>94.43</cell> + <cell>96.66</cell> + <cell>2.23</cell> + </row> + + + <row> + <cell>fAoOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>685</cell> + <cell>51</cell> + <cell>28128</cell> + <cell>2007</cell> + <cell>94.72</cell> + <cell>96.61</cell> + <cell>1.89</cell> + </row> + + + <row> + <cell>4zQfAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3701</cell> + <cell>199</cell> + <cell>156399</cell> + <cell>8681</cell> + <cell>88.68</cell> + <cell>96.37</cell> + <cell>7.69</cell> + </row> + + + <row> + <cell>PzMJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>662</cell> + <cell>77</cell> + <cell>27724</cell> + <cell>2817</cell> + <cell>90.7</cell> + <cell>95.49</cell> + <cell>4.79</cell> + </row> + + + <row> + <cell>u4cnAAAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3795</cell> + <cell>95</cell> + <cell>159827</cell> + <cell>4889</cell> + <cell>91.31</cell> + <cell>95.21</cell> + <cell>3.9</cell> + </row> + + + <row> + <cell>7JVMAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3780</cell> + <cell>112</cell> + <cell>159080</cell> + <cell>5816</cell> + <cell>71.35</cell> + <cell>94.62</cell> + <cell>23.27</cell> + </row> + + + <row> + <cell>8dAyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3780</cell> + <cell>111</cell> + <cell>159841</cell> + <cell>4271</cell> + <cell>84.45</cell> + <cell>94.24</cell> + <cell>9.79</cell> + </row> + + + <row> + <cell>htQwAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3792</cell> + 
<cell>98</cell> + <cell>158623</cell> + <cell>4996</cell> + <cell>88.42</cell> + <cell>94.14</cell> + <cell>5.72</cell> + </row> + + + <row> + <cell>YAZXAAAAcAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>1909</cell> + <cell>2190</cell> + <cell>89328</cell> + <cell>82910</cell> + <cell>80.68</cell> + <cell>92.92</cell> + <cell>12.24</cell> + </row> + + + <row> + <cell>MzQJAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>691</cell> + <cell>45</cell> + <cell>28714</cell> + <cell>1622</cell> + <cell>84.9</cell> + <cell>89.52</cell> + <cell>4.62</cell> + </row> + + + <row> + <cell>kggOAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>685</cell> + <cell>51</cell> + <cell>28216</cell> + <cell>1983</cell> + <cell>85.64</cell> + <cell>87.56</cell> + <cell>1.92</cell> + </row> + + + <row> + <cell>Fy4JAAAAQAAJ</cell> + <cell>en_best</cell> + <cell>709</cell> + <cell>25</cell> + <cell>29424</cell> + <cell>943</cell> + <cell>78.9</cell> + <cell>85.15</cell> + <cell>6.25</cell> + </row> + + + <row> + <cell>oNEyAQAAMAAJ</cell> + <cell>fraktur_1_best</cell> + <cell>3798</cell> + <cell>92</cell> + <cell>160955</cell> + <cell>3589</cell> + <cell>66.31</cell> + <cell>84.79</cell> + <cell>18.48</cell> + </row> + <trailer xml:id="tab04"> + <ref type="intern" target="#tab4">Tab. 4</ref>: Model performance evaluated with a leave-one-out + strategy. Within each group (German Fraktur and English Antiqua), an individual + model is trained on all samples except those from the left-out identifier, on which the + model is then tested. The fine-tuned model improves over the baseline in + each case, often by a large margin. [Lassner et al. 2021]<ref type="graphic" target="#ocr_2021_t4"/> + </trailer> + </table> + <p>In the third setting, we trained + multiple models within each group, always training on all books of that group except + one and using only the data of the left-out book for testing. In all settings, we + also report the performance of the off-the-shelf OCR model on the test set for + comparison.</p> + <p>As depicted in <ref type="graphic" target="#tab04">Table 4</ref>, fine-tuning + improves character accuracy in every case, even for the held-out book. This shows that + the fine-tuned model did not overfit to a specific book but captured patterns + of a specific script. We should note that in some cases of the third experiment, + different volumes of the same work occur as individual samples; for example, the second volume of + Anne of Geierstein by Scott was not held out when testing on the third volume of + Anne of Geierstein. Scripts in different volumes of the same work are often more similar than scripts + that merely share the same font type, which might improve the outcome of this experiment in some + cases.</p> + <p>For all three experiments, the Kraken + OCR engine with a German Fraktur model and an English model served as the baselines. + Both models were provided by the maintainers of Kraken.<note type="footnote"> See + <ref type="bibliography" target="#kiessling_kraken_2019">Kiessling + 2019</ref>. For the baselines and for fine-tuning, version 3.0.4 of the Kraken engine was + used, which can be found at <ref target="https://github.com/mittagessen/kraken/releases/tag/3.0.4">kraken release 3.0.4</ref>, + <ref type="bibliography" target="#kraken_git_2021">Kiessling + 2021</ref>.</note> + </p>
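+                  <p>Editorial illustration, not part of the original article: the character
+                     accuracy reported in the tables above is commonly computed as one minus
+                     the edit distance between the predicted text and the ground truth,
+                     divided by the number of ground truth characters. The Python sketch below
+                     shows this metric together with a schematic leave-one-out loop over
+                     books, mirroring the third experiment; all names (edit_distance,
+                     character_accuracy, leave_one_out, fine_tune, evaluate) are hypothetical
+                     and do not refer to the published tooling.</p>
+                  <eg xml:space="preserve"><![CDATA[
+ def edit_distance(a: str, b: str) -> int:
+     """Levenshtein distance between two strings via dynamic programming."""
+     prev = list(range(len(b) + 1))
+     for i, ca in enumerate(a, start=1):
+         curr = [i]
+         for j, cb in enumerate(b, start=1):
+             curr.append(min(prev[j] + 1,                # deletion
+                             curr[j - 1] + 1,            # insertion
+                             prev[j - 1] + (ca != cb)))  # substitution
+         prev = curr
+     return prev[-1]
+
+ def character_accuracy(ground_truth: str, prediction: str) -> float:
+     """Character accuracy in percent, as reported in the tables."""
+     if not ground_truth:
+         return 100.0
+     return 100.0 * (1 - edit_distance(ground_truth, prediction) / len(ground_truth))
+
+ def leave_one_out(books):
+     """books maps a book identifier to its list of (line image, transcription)
+     pairs. For each identifier, train on all other books of the group and test
+     on the held-out book; fine_tune and evaluate are placeholders."""
+     results = {}
+     for held_out in books:
+         train = [pair for ident, pairs in books.items()
+                  if ident != held_out for pair in pairs]
+         # model = fine_tune(baseline_model, train)
+         # results[held_out] = evaluate(model, books[held_out])
+         results[held_out] = len(train)  # placeholder: number of training lines
+     return results
+ ]]></eg>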
<p>In the context of the research project + for which this data set was created, the performance gain is especially relevant, as + research shows that a certain level of OCR quality is needed to obtain meaningful + results on downstream tasks. For example, Hamdi et al. show the impact of OCR quality + on the performance of Named Entity Recognition as a + downstream task.<note type="footnote"> + See + <ref type="bibliography" target="#hamdi_impact_2020">Hamdi + et al. 2020</ref>.</note> With additional cross-training of sub-corpora, we are + confident that we will be able to push the character accuracy beyond 95% on all test + sets, which will enable us to perform translatorship attribution analysis.</p> + <p>More generally, the results show that + in a variety of settings, additional ground truth data will improve the OCR results. + This advocates strongly for the publication of a greater range of, and especially + more diverse, sets of open and reusable ground truth data for historical prints.</p> + <p>The data set we thus created and + published is open and reproducible following the described framework. It can serve + as a template for other OCR ground truth data set projects. It is therefore not only + relevant because it shows why the community should create additional data sets: it + also shows how to create them and invites new publications that are bound to + bring Digital Humanities research a step forward.</p> + <p>The data pairs are compatible with + other OCR ground truth data sets such as OCR-D<note type="footnote"> See + <ref type="bibliography" target="#baierer_ocr_2019">Baierer + et al. 2019</ref>.</note> or GT4HistOCR<note type="footnote"> See + <ref type="bibliography" target="#springmann_truth_2018">Springmann + et al. 2018</ref>.</note>. Using the established PAGE-XML standard enables + interoperability and reusability of the transcriptions. Using open licenses for the + source code and the data, and publishing releases at an institutional open data + repository ensures representativeness and durability.</p>
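+                  <p>Editorial illustration, not part of the original article: each PAGE file
+                     in the data set pairs the coordinates of a text line on the page image
+                     with its transcription. A minimal, namespace-agnostic reading sketch in
+                     Python could look as follows; the file name and function names are
+                     hypothetical, and element names follow the PAGE standard.</p>
+                  <eg xml:space="preserve"><![CDATA[
+ import xml.etree.ElementTree as ET
+
+ def local(tag):
+     """Strip the XML namespace, keeping only the local element name."""
+     return tag.rsplit('}', 1)[-1]
+
+ def read_page_lines(path):
+     """Yield (points, text) pairs for every TextLine in a PAGE-XML file:
+     the bounding polygon on the page image and the transcription string."""
+     root = ET.parse(path).getroot()
+     for elem in root.iter():
+         if local(elem.tag) != 'TextLine':
+             continue
+         points, text = None, None
+         for child in elem:  # direct children of the TextLine element
+             if local(child.tag) == 'Coords':
+                 points = child.get('points')
+             elif local(child.tag) == 'TextEquiv':
+                 for uni in child:
+                     if local(uni.tag) == 'Unicode':
+                         text = (uni.text or '').strip()
+         yield points, text
+
+ # Hypothetical usage:
+ # for points, text in read_page_lines('page_0001.xml'):
+ #     print(points, text)
+ ]]></eg>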
</div> + <div type="chapter"> + <head>6. Conclusion</head> + <p>The work we carried out to + constitute the data set needed for our stylometric research provided not only a + ground truth data set, but also a systematic approach to the legal issues we + encountered when extracting information from the scanned books we rely on as a + primary source. While we have been successful at automating many work steps, + improvements could still be envisioned.</p> + <p>In future work, we would like to + enrich the links to the original resource with additional links to mirrors of the + resources in order to increase the persistence of the image sources, also adding + OCLC IDs as universal identifiers whenever available.<note type="footnote"> OCLC provides a registry of IDs referencing items in libraries; see <ref target="https://www.worldcat.org/">worldcat.org</ref>, OCLC 2021.</note> We would also like + to look into ways to automate the download of the PDFs from Google Books, the + Internet Archive or CHIs. Also, we would like to extend the framework we proposed + here. It could serve for hybrid data sets with parts where the copyright status of the + image data is unclear (then published as a data set formula) and parts with approved + image redistribution (which could then be published as a built data set). It could + be used, for example, for the data sets from the Bayerische Staatsbibliothek and the University + of Illinois Urbana-Champaign.</p> + <p>Finally, we would like to encourage + scholars to publish their OCR ground truth data sets in a similarly open and + interoperable manner, thus making it possible to ultimately increase accessibility + to archives and libraries for everyone.</p> + </div> + <div type="chapter"> + <head>Acknowledgements</head> + <p>This work has been supported by the + German Federal Ministry for Education and Research as BIFOLD.</p> + </div> + <div type="chapter"> + <head>List of contracts</head> + <p>The contracts between</p> + <list type="unordered"> + <item>a number of US-based libraries and Google is available <ref target="https://web.archive.org/web/20120707144623/http:/thepublicindex.org/docs/libraries/cic.pdf">here</ref>,</item> + <item>the British Library and Google is available <ref target="https://www.openrightsgroup.org/app/uploads/2020/03/BL-Google-Contract.pdf">here</ref>,</item> + <item>the National Library of the Netherlands and Google is available <ref target="https://web.archive.org/web/20111025094345/http:/www.kb.nl/nieuws/2011/contract-google-kb.pdf">here</ref>, </item> + <item>the University of Michigan and Google is available <ref target="http://web.archive.org/web/20050906002322/https:/www.lib.umich.edu/mdp/um-google-cooperative-agreement.pdf">here</ref>, </item> + <item>the University of Texas at Austin and Google is available <ref target="https://web.archive.org/web/20151226021049/https:/www.lib.utexas.edu/sites/default/files/google/utexas_google_agreement.pdf">here</ref>,</item> + <item>the University of Virginia and Google is available <ref target="https://web.archive.org/web/20120707144748/http:/thepublicindex.org/docs/libraries/virginia.pdf">here</ref>,</item> + <item>Scanning Solutions (for the Bibliothèque municipale de Lyon) and Google is + available <ref target="https://web.archive.org/web/20120707144718/http:/thepublicindex.org/docs/libraries/lyon_ae.pdf">here</ref>,</item> + <item>the University of California and Google is available <ref target="https://web.archive.org/web/20120707144625/http:/thepublicindex.org/docs/libraries/california.pdf">here</ref>.</item> + </list> + </div> + </div> + <div type="bibliography"> + <head>Bibliographic references</head> + <listBibl> + <bibl xml:id="bachleiter_uebersetzungsfabriken_1989">Norbert + Bachleitner: »Übersetzungsfabriken«: das deutsche Übersetzungswesen in der ersten + Hälfte des 19. Jahrhunderts. In: Internationales Archiv für Sozialgeschichte der + deutschen Literatur 14 (1989), i. 1, pp. 1–50. <ptr type="gbv" cRef="129444383" + /></bibl> + <bibl xml:id="baillot_data_2016">Anne Baillot / Mike Mertens / Laurent Romary: Data fluidity + in DARIAH – pushing the agenda forward. In: Bibliothek Forschung und Praxis 39 + (2016), i. 3, pp. 350–357. DOI: <ref target="https://doi.org/10.1515/bfp-2016-0039">10.1515/bfp-2016-0039</ref> + <ptr type="gbv" cRef="12961193X" + /></bibl> + <bibl xml:id="baierer_ocr_2019">Konstantin Baierer / Matthias Boenig / Clemens Neudecker: + Labelling OCR Ground Truth for Usage in Repositories.
In: Proceedings of the + International Conference on Digital Access to Textual Cultural Heritage (DATeCH2019: + 3, Brussels, 08.–10.05.2019) New York, NY 2019, pp. 3–8. <ptr type="gbv" cRef="1734515961" + /></bibl> + <bibl xml:id="htr_united_2021">HTR-United. In: GitHub.io. By Alix Chagué / Thibault + Clérice. 2021. [<ref target="https://htr-united.github.io/">online</ref>]</bibl> + <bibl xml:id="eclevia_data_2019">Marian Ramos Eclevia / John Christopher La Torre Fredeluces + / Carlos Jr Lagrosas Eclevia / Roselle Saguibo Maestro: What Makes a Data Librarian? + An Analysis of Job Descriptions and Specifications for Data Librarian. In: + Qualitative and Quantitative Methods in Libraries 8 (2019), i. 3, pp. 273–290. [<ref target="http://qqml-journal.net/index.php/qqml/article/view/541">online</ref>]</bibl> + <bibl xml:id="engl_volltexte_2020">Elisabeth Engl: Volltexte für die Frühe Neuzeit. Der + Beitrag des OCR-D-Projekts zur Volltexterkennung frühneuzeitlicher Drucke. In: + Zeitschrift für Historische Forschung 47 (2020), i. 2, pp. 223–250. <ptr type="gbv" cRef="129309338" + /></bibl> + <bibl xml:id="eth_transcriptiones_2020">Transcriptiones. A platform for hosting, accessing and + sharing transcripts of non-digitised historical manuscripts. Ed. by ETH-Library. + Zürich 2020. [<ref target="https://www.librarylab.ethz.ch/project/transcriptiones/">online</ref>]</bibl> + <bibl xml:id="hamdi_impact_2020">Ahmed Hamdi / Axel Jean-Caurant / Nicolas Sidère / Mickaël + Coustaty: Assessing and Minimizing the Impact of OCR Quality on Named Entity + Recognition. In: Digital libraries for open knowledge. + International Conference on Theory and Practice of Digital Libraries. (TPDL: 24, + Lyon, 25.–27.08.2020) Cham 2020, pp. 87–101. <ptr type="gbv" cRef="173775262X" + /></bibl> + <bibl xml:id="heritage_data_2021">The Heritage Data Reuse Charter. In: DARIAH.eu. 2021. + [<ref target="https://www.dariah.eu/activities/open-science/data-re-use/">online</ref>]</bibl> + + + + <bibl xml:id="google_informationen_2006">Informationen und Richtlinien. Ed. by Google Inc. In: Google Books. Walter Scott: + Großvater's Erzählungen aus der Geschichte von Frankreich. Ed. by Georg Nicolaus Bärmann. Neue Folge. Zweiter Theil. Zwickau 1831. Digitalisiert am 15.11.2006. PDF. + [<ref target="https://books.googleusercontent.com/books/content?req=AKW5QacqJ1ytah-8JsyWYKfgLVnZGMYKbDlV_xg2ynjx_aaepDsn3n6q0CnzACs-ZyfZHd6O2QajiTZGiS8jng4nnH5kyY3xFjFOMbcRxaq1KF15JPVAQl-6en4LlMhGvzXe13qX2haJnRTvVGDAUa4W9_JG8toPUCCfVbqL8TF-GshZr4L9EgHZ6W4g2xUGqbRJjAs0ImImKkWhSDTUi-8jGATaViIV5xgVreVUKA4lgwFYxhpesnqlPwpOIDkJW8w3m0ztj49FPsVRDx8aepxC39l-b1Apuw">online</ref>] + </bibl> + + <bibl xml:id="kiessling_kraken_2019">Benjamin Kiessling: Kraken – an Universal Text Recognizer + for the Humanities. In: Digital Humanities 2019 Conference papers. (DH2019, Utrecht, + 08.–12.07.2019) Utrecht 2019. [<ref target="https://dev.clariah.nl/files/dh2019/boa/0673.html">online</ref>]</bibl> + <bibl xml:id="kraken_git_2021">Kraken 3.0.4. In: GitHub.io. Ed. by Benjamin Kiessling. + 2021. [<ref target="https://github.com/mittagessen/kraken/releases/tag/3.0.4">online</ref>]</bibl> + <bibl xml:id="lassner_data_2021">David Lassner / Julius Coburger / Clemens Neudecker / Anne + Baillot: Data set of the paper »Publishing an OCR ground truth data set for reuse in + an unclear copyright setting«. In: zenodo.org. 2021. Version 1.1 from 07.05.2021.
+ DOI: <ref target="https://doi.org/10.5281/zenodo.4742068">10.5281/zenodo.4742068</ref> + </bibl> + <bibl xml:id="mets_loc_2021">METS. Metadata Encoding & Transmission Standard. Home. + Ed. by The Library of Congress. Washington D.C. 04.10.2021. [<ref target="http://www.loc.gov/standards/mets/">online</ref>]</bibl> + <bibl xml:id="liebl_newspapers_2020">Bernhard Liebl / Manuel Burghardt: From Historical + Newspapers to Machine-Readable Data: The Origami OCR Pipeline. In: Proceedings of + the Workshop on Computational Humanities Research. Ed. by Folgert Karsdorp / Barbara + McGillivray / Adina Nerghes / Melvin Wevers. (CHR2020, Amsterdam, 18.–20.11.2020) Aachen 2020, pp. 351–373. (= CEUR Workshop Proceedings, 2723) URN: <ref target="https://nbn-resolving.org/urn:nbn:de:0074-2723-3">urn:nbn:de:0074-2723-3</ref> + </bibl> + <bibl xml:id="ocr_data_2021">OCR-Data. In: GitHub.io. 2021. [<ref target="https://github.com/millawell/ocr-data">online</ref>]</bibl> + <bibl xml:id="ocrd_ocrd_2021">OCR-D. Specifications. In: OCR-D.de. Wolfenbüttel 2021. + [<ref target="https://ocr-d.de/en/spec/">online</ref>]</bibl> + <bibl xml:id="ocr_model_2021">OCR/HTR model repository. In: Zenodo.org. 2021. + [<ref target="https://zenodo.org/communities/ocr_models/?page=1&size=20">online</ref>]</bibl> + <bibl xml:id="worldcat_oclc_2021">WorldCat. Ed. by OCLC. Dublin 2021. [<ref target="https://www.worldcat.org/">online</ref>] + </bibl> + <bibl xml:id="padilla_report_2019">Thomas Padilla / Laurie Allen / Hannah Frost / Sarah Potvin + / Elizabeth Russey Roke / Stewart Varner: Final Report – Always Already + Computational: Collections as Data. In: zenodo.org. Version 1 from 22.05.2019. + DOI: <ref target="http://doi.org/10.5281/zenodo.3152935">10.5281/zenodo.3152935</ref> + </bibl> + <bibl xml:id="pletschacher_page_2010">Stefan Pletschacher / Apostolos Antonacopoulos: The PAGE + (Page Analysis and Ground-Truth Elements) Format Framework. In: Proceedings of the + 20th International Conference on Pattern Recognition. Ed. by IEEE. (ICPR: 20, + Istanbul, 23.–26.08.2010) Piscataway, NJ 2010, vol. 1, pp. 257–260. <ptr type="gbv" cRef="639567843"/></bibl> + + <bibl xml:id="readcoop_models_2021">Public AI models in Transkribus. Ed. by READ-COOP. + Innsbruck 2021. [<ref target="https://readcoop.eu/transkribus/public-models/">online</ref>]</bibl> + + <bibl xml:id="reul_learning_2017">Christian Reul / Christoph Wick / Uwe Springmann / Frank + Puppe: Transfer Learning for OCRopus Model Training on Early Printed Books. In: + Zeitschrift für Bibliothekskultur 5 (2017), i. 1, pp. 32–45. In: zenodo.org. Version + 1 from 22.12.2017. DOI: <ref target="https://doi.org/10.5281/zenodo.4705364">10.5281/zenodo.4705364</ref> + </bibl> + <bibl xml:id="ruiz_agreement_2011">Javier Ruiz: Access to the Agreement between Google Books + and the British Library. In: Open Rights Group. Ed. by The Society of Authors. + Blogpost from 24.08.2011. + [<ref target="https://www.openrightsgroup.org/blog/access-to-the-agreement-between-google-books-and-the-british-library/">online</ref>] </bibl> + + <bibl xml:id="schoech_textformate_2020">Christof Schöch / Frédéric Döhl / Achim Rettinger / Evelyn + Gius / Peer Trilcke / Peter Leinen / Fotis Jannidis / Maria Hinzmann / Jörg Röpke: + Abgeleitete Textformate: Text und Data Mining mit urheberrechtlich geschützten + Textbeständen. In: Zeitschrift für digitale Geisteswissenschaften 5 (2020).
DOI: + <ref target="http://doi.org/10.17175/2020_006">10.17175/2020_006</ref></bibl> + + + + <bibl xml:id="springmann_truth_2018">Uwe Springmann / Christian Reul / Stefanie Dipper / + Johannes Baiter: Ground Truth for training OCR engines on historical documents in + German Fraktur and Early Modern Latin. In: The Journal for Language Technology and + Computational Linguistics 33 (2018), i. 1, pp. 97–114. PDF. [<ref target="https://jlcl.org/content/2-allissues/2-heft1-2018/jlcl_2018-1_5.pdf">online</ref>] + </bibl> + </listBibl> + </div> + <div type="abbildungsnachweis"> + <head>List of Figures with + Captions</head> + <desc type="table" xml:id="tab1"><ref target="#tab01" type="intern">Tab. 1</ref>: Responses of library + institutions to our request to grant permission to publish excerpts of the scans + for which they were contractors of the digitization. Most institutions responded + within a few working days; apart from the fact that most acknowledged the + public domain status of the items, the responses were very diverse. Many answered that + they are either not responsible or only responsible for their Library Copy of + the PDF. [Lassner et al. 2021] + <ref type="graphic" target="#ocr_2021_t1"/></desc> + <desc type="graphic" xml:id="abb1">Excerpt from a METS file as + used in our data set. For each book, we created one METS file. The link to the + resource contains the identifier and the page number. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref type="graphic" target="#ocr_2021_001" /></desc> + <desc type="graphic" xml:id="abb2">Excerpt from the PAGE + file showing the bounding box of the line on the page image and the + corresponding text string. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref type="graphic" target="#ocr_2021_002"/></desc> + <desc type="graphic" xml:id="abb3">Excerpt from the METS + file as used in our data set. For each book, we created one METS file. This part + of the METS file contains the references to the PAGE files. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref type="graphic" target="#ocr_2021_003"/></desc> + <desc type="graphic" xml:id="abb4">Excerpt from the METS + file as used in our data set. For each book, we created one METS file. Together + with the links to the image resources shown in <ref type="graphic" target="#ocr_2021_001">Figure 1</ref>, and the links to the + PAGE files, the METS file holds the connection between the text lines and the + page images. [<ref type="bibliography" target="#lassner_data_2021">Lassner + et al. 2021</ref>]<ref type="graphic" target="#ocr_2021_004"/></desc> + <desc type="table" xml:id="tab2"><ref target="#tab02" type="intern">Tab. 2</ref>: Performance comparison of + baseline model and fine-tuned model for each document in our corpus. For almost + all documents there is a large improvement over the baseline even with a very + limited number of fine-tuning samples. The sums of lines and characters depicted + in the table do not add up to the numbers reported in the text because, during + training, we used an additional split of the data as an evaluation set of + the same size as the respective test set. [Lassner et al. 2021]<ref type="graphic" target="#ocr_2021_t2"/></desc> + <desc type="table" xml:id="tab3"><ref target="#tab03" type="intern">Tab. 3</ref>: Performance comparison of + baseline model and fine-tuned model trained on random splits of samples within + the same group. [Lassner et al.
2021]<ref type="graphic" target="#ocr_2021_t3"/></desc> + <desc type="table" xml:id="tab4"><ref target="#tab04" type="intern">Tab. 4</ref>: Model performance + evaluated with a leave-one-out strategy. Within each group (German Fraktur and + English Antiqua), an individual model is trained on all samples except those from the + left-out identifier, on which the model is then tested. The fine-tuned model + improves over the baseline in each case, often by a large margin. [Lassner + et al. 2021]<ref type="graphic" target="#ocr_2021_t4"/></desc> + + </div> + </body> + </text> +</TEI>