diff --git a/components/content/embed.tsx b/components/content/embed.tsx index d174af824..8f1f5021b 100644 --- a/components/content/embed.tsx +++ b/components/content/embed.tsx @@ -1,7 +1,7 @@ import type { ReactNode } from "react"; interface EmbedProps { - children: ReactNode; + children?: ReactNode; src: string; /** Added by `with-iframe-titles` mdx plugin. */ title?: string; @@ -20,7 +20,7 @@ export function Embed(props: Readonly): ReactNode { src={src} title={title} /> -
{children}
+ {children != null && children !== "" ?
{children}
: null} ); } diff --git a/components/content/figure.tsx b/components/content/figure.tsx index 74d53afa4..1979b9ee8 100644 --- a/components/content/figure.tsx +++ b/components/content/figure.tsx @@ -8,7 +8,7 @@ interface FigureProps { /** @default "stretch" */ alignment?: FigureAlignment; alt?: string; - children: ReactNode; + children?: ReactNode; /** Maybe added by `with-image-sizes` mdx plugin. */ height?: number; src: string; @@ -22,7 +22,7 @@ export function Figure(props: Readonly): ReactNode { return (
{alt} -
{children}
+ {children != null && children !== "" ?
{children}
: null}
); } diff --git a/components/content/video.tsx b/components/content/video.tsx index 46074c985..c00c8a4b9 100644 --- a/components/content/video.tsx +++ b/components/content/video.tsx @@ -4,7 +4,7 @@ import type { VideoProvider } from "@/lib/content/options"; import { createVideoUrl } from "@/lib/navigation/create-video-url"; interface VideoProps { - children: ReactNode; + children?: ReactNode; id: string; provider: VideoProvider; startTime?: number | null; @@ -27,7 +27,7 @@ export function Video(props: Readonly): ReactNode { src={src} title={title} /> -
{children}
+ {children != null && children !== "" ?
{children}
: null} ); } diff --git a/content/en/people/dariah-teach/index.mdx b/content/en/people/dariah-teach/index.mdx new file mode 100644 index 000000000..6b9b2be85 --- /dev/null +++ b/content/en/people/dariah-teach/index.mdx @@ -0,0 +1,5 @@ +--- +name: DARIAH Teach +image: /assets/images/default-avatar.svg +social: [] +--- diff --git a/content/en/resources/hosted/digital-scholarly-editions-manuscripts-texts-and-tei-encoding-dariah-teach/index.mdx b/content/en/resources/hosted/digital-scholarly-editions-manuscripts-texts-and-tei-encoding-dariah-teach/index.mdx new file mode 100644 index 000000000..96b6ed26b --- /dev/null +++ b/content/en/resources/hosted/digital-scholarly-editions-manuscripts-texts-and-tei-encoding-dariah-teach/index.mdx @@ -0,0 +1,845 @@ +--- +title: "Digital Scholarly Editions: Manuscripts, Texts and TEI Encoding" +locale: en +publication-date: 2017-05-11 +version: 1.0.0 +authors: + - dariah-teach +editors: [] +contributors: + - dariah-teach +tags: + - tei + # - editing-tools + # - encoding + - xml + # - critical-edition +sources: + - dariah-teach +license: cc-by-4.0 +table-of-contents: true +summary: + content: This course will introduce you to the creation of digital scholarly editions, for manuscripts or printed texts, with the help of the TEI and other related technologies. +content-type: training-module +--- + +## Useful Resources + + + +Useful Resources + + + +### Oxygen XML Editor + +To follow the guided exercises in this course, you will need to use the XML editor oXygen: +[https://www.oxygenxml.com/](https://www.oxygenxml.com/). + +oXygen is one of the best XML editors currently available. It has many advantages over other +editors: it is relatively easy to set up and to use, it is very powerful, cross-platform (e.g. it +works on the Mac), relatively cheap and has many advanced features that show that the creators are +keen to keep abreast of new developments in the XML world. 
Crucially, the developers have shown an +unusually keen interest in the needs of humanities computing scholarship, in particular that which +surrounds the Text Encoding Initiative. + +Its main disadvantages are that it is not free (many XML editors are) and that its target audience +seems to be developers (as opposed to, say, average users just needing to edit XML documents) and so +it is not quite as easy to customise for less technical users as some of its more expensive +competitors. + +Each XML editor has its own advantages and disadvantages, but in editing XML-like documents, the +chief functions you will need initially are the following: + +- _Checking for well-formedness_. Does the document you are working on follow the basic syntax rules + of XML? +- _Validation_. Does the XML document validate against the appropriate DTD or schema (if there is + one)? +- _Editing help_. Different editors offer different kinds of help when editing documents: some show + you which elements (or attributes) are available in a given position (according to the document's + DTD or schema), while others show you the internal structure of an XML document as a 'tree' + diagram so that you can navigate quickly around it. + +XML editors can look deceptively like Microsoft Word or other mainstream text processing programs. +It is useful to have similar functionality as word processors (e.g. spelling checkers), but it is +important to understand that XML editors are quite different, and that a good XML editor will +concentrate on the features that XML enables. + +### XML and TEI Encoding + +Here are a few resources about XML and TEI encoding in general. 
The Guidelines are the point of +reference for the Text Encoding Initiative, and you are strongly recommended to read the chapters on +[Representation of Primary Sources](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/PH.html), +the [Critical Apparatus](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/TC.html), and +[Names, Dates, People and Places](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ND.html). +These are the chapters covered more in detail in this course. + +The TEI website provides a gentle introduction to XML, which can be completed by David Birnbaum's +"Even Gentler Introduction to XML". And Finally, TEI by Example offers a series of online tutorials +with plenty of examples, interactive tests and exercises. + +- **TEI Guidelines**: + [http://www.tei-c.org/release/doc/tei-p5-doc/en/html/index.html](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/index.html) +- A Gentle Introduction to XML: + [http://www.tei-c.org/release/doc/tei-p5-doc/en/html/SG.html](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/SG.html) +- An Even Gentler Introduction to XML: + [http://dh.obdurodon.org/what-is-xml.xhtml](http://dh.obdurodon.org/what-is-xml.xhtml) +- TEI by example: [http://teibyexample.org/](http://teibyexample.org/) + +### Digital Scholarly Editing + +The following links are more specifically related to digital scholarly editing. + +The TEI Toolbox was created by Marjorie Burghardt as a tool for people preparing a natively digital +TEI critical edition. The TEI Toolbox lets you check that your critical edition is properly encoded, +and makes it possible to display parallel versions of a text. It also offers access to the TEI +Zoner, a tool to annotate images. + +Two catalogs record digital (scholarly) editions, which give you an idea of the range of +possibilities provided by the digital format. 
If you are preparing a digital edition, these are also +a great source of examples that you can browse to discover how others have dealt with the same kind +of texts. + +If any term related to scholarly editing is obscure, do not hesitate to consult the Lexicon of +Scholarly Editing. It is an open access and multilingual resource that offers definitions for +concepts in the field of scholarly editing and textual criticism. + +- TEI Toolbox: + [http://ciham-digital.huma-num.fr/teitoolbox/](http://ciham-digital.huma-num.fr/teitoolbox/) +- Patrick Sahle's catalog of Digital Scholarly Editions: + [http://www.digitale-edition.de/](http://www.digitale-edition.de/) +- Greta Franzini's catalog of Digital Editions: + [https://dig-ed-cat.acdh.oeaw.ac.at/](https://dig-ed-cat.acdh.oeaw.ac.at/) +- Lexicon of Scholarly Editing: [http://uahost.uantwerpen.be/lse/](http://uahost.uantwerpen.be/lse/) + +### Annotating Images + +Here are a few tools that let you create TEI markup for images annotations. For more detail, take +the optional module on [Image Annotation]($@COURSEVIEWBYID*32@$§ion=5). + +- Image Markup Tool (PC + only):[ https://tapor.uvic.ca/~mholmes/image_markup/](https://tapor.uvic.ca/~mholmes/image_markup/) +- InkScape: [https://inkscape.org/en/](https://inkscape.org/en/) +- Oxygen + Plugin:[ https://github.com/oxygenxml/TEI-Facsimile-Plugin](https://github.com/oxygenxml/TEI-Facsimile-Plugin) +- TEI Zoner: + [http://ciham-digital.huma-num.fr/teitoolbox/zoner.php](http://ciham-digital.huma-num.fr/teitoolbox/zoner.php) + +### XML Transformation + +Here are three resources to help you transform your TEI XML in other formats. 
+ +- TEI boilerplate : [http://dcl.ils.indiana.edu/teibp/](http://dcl.ils.indiana.edu/teibp/) +- XSL stylesheets for TEI XML: + [http://www.tei-c.org/release/doc/tei-xsl/](http://www.tei-c.org/release/doc/tei-xsl/) +- Transforming Data for Reuse and Re-publication with XML and XSL: + [http://programminghistorian.org/lessons/transforming-xml-with-xsl](http://programminghistorian.org/lessons/transforming-xml-with-xsl) + + + + + +## Course Materials + + + +# Introduction: Encoding and XML + +## Why Do We Encode? + + + +Why Do We Encode? + + + +Advantages of encoding texts with XML. + + + + + + + + + +## Why Do We Standardize? + + + +Why Do We Standardize? + + + +The Text Encoding Initiative, a common standard for text encoding in the Humanities. + + + + + + + + + +## Encoding a Text + + + +Encoding a Text + + + + + + + +## Displaying a Text + + + +Displaying a Text + + + + + + + +## The XML Constellation: How the Technologies Work Together + + + +The XML Constellation: How the Technologies Work Together + + + +XML works together with other technologies: + +- Schema - set of rules which define the structure of the document (such as the TEI); +- Transformation - conversion from XML to a different format with XSLT or CSS. + + + + + + + + + +# Manuscript Transcription + +## Analysis and Modelling + + + +Analysis and Modelling + + + + + + + +## Pages and Facsimiles + + + +Pages and Facsimiles + + + + + + + +## Phrase-Level Features + + + + + +In this lesson, you will learn to encode various textual features which may appear at the phrase +level: abbreviations, scribal interventions (such as additions, omissions), normalisations, +modifications by the editor, and parts that are difficult or impossible to read. 
+ + + + + +Introduction + + + + + + + + + +Abbreviations + + + + + + + + + +Scribal Interventions: Additions, Deletions and Substitutions + + + + + + + + + +Normalisations + + + + + + + + + +Modifications by the Editor + + + + + + + + + +Difficult Readings and Missing Parts + + + + + + + + + +Exercise (test, to edit) + + + +Here are some images from manuscripts. How would you encode the texts, using the elements that you have learned about in this lesson? + +... (I could also create an assignment in the main section) + + + + + + + +## Quiz + +## Transcription Practice + + + +Transcription Practice + + + +Practice the encoding of phrase-level features with the images provided in the course materials for week 2. + +The images are from two manuscripts of +[Calpurnius Flaccus](https://en.wikipedia.org/wiki/Calpurnius_Flaccus), a Latin author from the +second century AD. The manuscripts, +[CLM 309](http://daten.digitale-sammlungen.de/~db/0009/bsb00090859/images/index.html?id=00090859&groesser=&fip=193.174.98.30&no=&seite=294) +(B) and +[CLM 316](http://daten.digitale-sammlungen.de/~db/0009/bsb00090394/images/index.html?id=00090394&groesser=&fip=193.174.98.30&no=&seite=9) +(M), are held in Münich and are both available online at the Bayerische Staatsbibliothek website. +The full text is available at the Packard Humanities website +[here](http://latin.packhum.org/loc/1100/1/0#0). + +Remember to consider the following types of features: + +- Scribal interventions + - addition + - omission + - substitution +- Editorial interventions + - regularization of spelling: for instance, regularize the e caudata (ę) to _ae_ or _oe_ + - expansion of abbreviations + - supplied text +- Difficult readings or damaged page + +If you need help with abbreviations, check Capelli's dictionary of Latin abbreviations: +[http://www.hist.msu.ru/Departments/Medieval/Cappelli/](http://www.hist.msu.ru/Departments/Medieval/Cappelli/). 
+ + + + + +# Critical Editions and the TEI + +## Understanding Critical Editions + + + +Understanding Critical Editions + + + + + + + +## The TEI Proposal + + + +The TEI Proposal + + + + + + + +## Positive and Negative Apparatus + + + +Positive and Negative Apparatus + + + +The critical apparatus can take different format: positive or negative apparatus. + + + + + + + + + +## Encoding a Critical Edition + + + +Encoding a Critical Edition + + + + + + + +## Quiz + +# Critical Editions: Advanced Features + +## Indexing + + + +Indexing + + + + + + + +## Names: People, Places and Organisations + + + + + +In this lesson, you will learn to encode names of people, places, and their relationships to +organisations. + + + + + +People Names + + + + + + + + + +People, Places and Organisations + + + + + + + + + +Place Names + + + + + + + + + +## Encoding Dates + + + +Encoding Dates + + + + + + + +## Exercises + + + +Exercises + + + +### Exercise 1 + +Encode Names, Dates, Places and their references in the first letter of Mary Shelley's Frankenstein. + +### Exercise 2 + +Create lists of Persons, Places and Organizations in the extracts from William Dalrymple's City of +Djinns: - Encode at least two persons with full biographical details + +- Add the relationships between persons, places and organisations +- Tag at least one instance of each person, place and organisation in the text +- Create an index of the names in the text, along with a page where they are mentioned + +Feel free to encode as many entities as you wish, as well as dates. You can check Wikipedia in order +to gather information about the Mughal emperor +[Shah Jehan](https://en.wikipedia.org/wiki/Shah_Jahan) and his family, or about the writer +[William Dalrymple](). 
+ + + + + +# Annotating Images (Optional) + +## Annotating Images + + + +Annotating Images + + + + + + + +## Annotate a Medieval Bestiary + + + +Annotate a Medieval Bestiary + + + +### Medieval Bestiaries + +Bestiaries are illustrated books which describe animals. They were very popular during the Middle +Ages, and the +[British Library](https://www.bl.uk/catalogues/illuminatedmanuscripts/TourBestiaryGen.asp) offers a +tour of its most interesting bestiaries in illuminated medieval manuscripts: + +"A bestiary is a book of real and imaginary beasts, though its subjects often extend to birds, +plants and even rocks. Long perceived merely as rudimentary natural histories, medieval bestiaries +actually reflect the belief that the natural world was designed by God to instruct mankind. They +describe the physical nature and habits of animals in order to elaborate on the moral or spiritual +significance of these characteristics." (British Library) + +The illustrations in your course material are from the manuscript +[Harley 3244](https://www.bl.uk/catalogues/illuminatedmanuscripts/record.asp?MSID=8798&CollID=8&NStart=3244), +an English codex of the 13th century. + +- Folio 39v: + [https://www.bl.uk/catalogues/illuminatedmanuscripts/ILLUMIN.ASP?Size=mid&IllID=21497](https://www.bl.uk/catalogues/illuminatedmanuscripts/ILLUMIN.ASP?Size=mid&IllID=21497) +- Folio 57v: + [https://www.bl.uk/catalogues/illuminatedmanuscripts/ILLUMIN.ASP?Size=mid&IllID=21535](https://www.bl.uk/catalogues/illuminatedmanuscripts/ILLUMIN.ASP?Size=mid&IllID=21535) +- Folio 58v: + [https://www.bl.uk/catalogues/illuminatedmanuscripts/ILLUMIN.ASP?Size=mid&IllID=21536](https://www.bl.uk/catalogues/illuminatedmanuscripts/ILLUMIN.ASP?Size=mid&IllID=21536) + +### Exercise + +Create zones for each illustration on the pages with the help of the +[TEI Zoner](http://ciham-digital.huma-num.fr/teitoolbox/zoner.php). 
Then link the zones, in the +<facsimile> element in your TEI file, to annotations about the animals in the <text> +part of your TEI file. + +**Bonus**: can you create an index of animals in alphabetical order, along with the folio(s) where +they appear? The result should look like this: + +- bee: 57v, 58v +- dove: 58v +- dragon: 39v, 58v +- duck: 57v +- elephant: 39v + + + + + +# TEI and XSLT (Optional) + +## Introduction + + + +Introduction + + + + + + + +## XPath + + + +XPath + + + + + + + +## Templates and Namespaces + + + +Templates and Namespaces + + + + + + + +## XSLT Transformations + + + +XSLT Transformations + + + + + + + +## Exercise + + + +Exercise + + + + + +#### Assignment: + +We've provided a slightly more complex version of the `snippet.xml` ... + +- Open the `snippet-task.xml` file +- Figure out which is the new element +- Open the `snippet.xslt` file +- Do the following: - Add a new template that handles the new element, showing it in BOLD in the + output - Change the template that handles the <title> element so that instead of using + quotation marks, it uses the element `` - Save your improved program as + `snippet-task.xslt` - Test this in the debugger - Once you have it working, make a transformation + scenario for it + + + + diff --git a/content/en/resources/hosted/text-encoding-and-the-tei/index.mdx b/content/en/resources/hosted/text-encoding-and-the-tei/index.mdx new file mode 100644 index 000000000..1fcc879e1 --- /dev/null +++ b/content/en/resources/hosted/text-encoding-and-the-tei/index.mdx @@ -0,0 +1,2189 @@ +--- +title: 'Text Encoding and the TEI' +locale: en +publication-date: '2016-01-27' +version: 1.0.0 +authors: + - dariah-teach +contributors: + - dariah-teach +tags: + - dh +sources: + - dariah-teach +license: cc-by-4.0 +table-of-contents: true +summary: + content: 'This course introduces the theory and practice of text encoding using the Guidelines of the Text Encoding Initiative.' 
+content-type: training-module + +--- + +# Text encoding and the Text Encoding Initiative + +## Introduction + + + + +Introduction + + + + +### What this Course is About + +This course is an introduction to the theories, practices, and methods that are used in the humanities for the encoding of texts for research, for preservation, and for online distribution. It focuses on a particular method, that of text encoding, using eXtensible Markup Language (XML), and a specialised schema common to humanities research, the Text Encoding Initiative (TEI). + +While there are other equally valid ways to store, preserve, and distribute textual data (such as using a relational structure), this course will not cover those methods. Rather, this course focuses on text encoding, its history and uses. There are also practical hands-on exercises which allow students to practice the theory learned. + +This course is divided into three main units, with each unit further subdivided into lessons: + +- Unit I is an introduction to textual scholarship, modelling, and markup languages: what they are, their history, and some of their uses and characteristics. This unit focuses on XML, how it was developed, its structure and form, and the standard used in the rest of the course, the Text Encoding Initiative (TEI); +- Unit II delves into XML in more detail: discussing the building blocks of XML and an overview of its rules and structures. This unit also introduces schema and DTDs, with exercises on how to model XML via a DTD; +- Unit III provides opportunities to apply what you have learned in the previous units. This begins with a brief overview of the TEI Guidelines, along with two exercises applying principles you have learned. + +### Who Created This Course + +**Susan Schreibman** is Professor of Digital Humanities and the Director of An Foras Feasa, the Humanities Institute at Maynooth University. She is the Coordinator of the #dariahTeach consortium.
Professor Schreibman has been involved with the TEI community for many years, serving on both the Consortium's Board and Technical Council. Professor Schreibman is the originator and editor of [The Versioning Machine](http://v-machine.org), a tool to compare and display multiple versions of text encoded according to one of the methods described in the TEI's Apparatus chapter. + +**Roman Bleier** holds a PhD in Digital Arts and Humanities and Medieval History from Trinity College Dublin where he worked on a digital edition of St Patrick's epistles based on diplomatic transcriptions encoded in TEI. His dissertation in Trinity College was inspired by previous work he did at the Royal Irish Academy where he was part of the 'St Patrick’s Confessio HyperStack Project' team. Roman is currently a DixIT Postdoctoral Fellow at the University of Graz and his research topic is 'Canonical reference and sustainability of digital editions'. He is also the Technical Editor of The Versioning Machine and member of the Institute for Documentology and Scholarly Editing (IDE). + +### Acknowledgements + +The authors of this course gratefully acknowledge the assistance of Conor Ryan who helped to improve the course through his practicum in the MA in Digital Humanities at Maynooth University; Shane McGarry, a PhD candidate in the Digital Arts and Humanities at Maynooth University, who ran an early focus group on the course, providing invaluable feedback for improvement, as well as co-supervised Conor Ryan's practicum. + +We also acknowledge the valuable input of Seamus Callagy, Mei Dong, Michelle Doran, and Mariana Sylivrili in early testing of the course. 
+ + + + +## Announcements + +# Unit I: Markup languages: their history and uses + +## Markup Languages and text modelling + + + + + + + +Textual Scholarship, Markup and Modelling + + + + +Modelling has always been a core activity of textual scholarship as the human record has been migrated from oral traditions to written ones, from handwriting on any number of surfaces, from parchment to stone, in caves, on tombstones, sacred and profane, to print, and more recently to electronic forms of composition, storage, and distribution. + +For centuries textual scholars have made choices in disambiguating marks on page, in deciding how to format text as it migrates from one form to another, to show changes in a work over time, be they authorial (i.e. created by the author of the work) or by other agents (editors, relations, friends, or unknown scribes). + +In the eighteenth and nineteenth centuries as print became the de facto medium in which to transmit en masse the textual record, be it for pleasure (the rise of the novel), for religious purposes (the reproduction of sacred texts and commentary), for or political purposes (as a way of defining the new nation states and their peoples by their literary inheritance), so too the need arose for individuals to adjudicate on the version of the text to be published, what introductory material might be included, or which texts should be group ![](/assets/content/assets/en/resources/hosted/text-encoding-and-the-tei/e24d3b6500c680bda715ff2d4876f202b94967db.jpg)together and published as collections. + + +As publishing became an industry in its own right, a text might pass through many hands before it was published. Long before the invention of computers, markup was used to annotate a text to instruct a compositor or typist how a particular passage should be printed or laid out. 
+ +Examples, familiar to proofreaders and others, include a caret to add in extra text, special symbols for passages to be omitted or printed in a particular font, and so forth. Even today, although texts are now, by and large set through soft type on a computer (as opposed to hard type which was done manually with slugs for each character) when publishers return proofs to authors or editors, it is still the norm to print out the proofs and use these same symbols to indicate what changes need to be made to the text. + +With the rise of computers to generate text (either for print or electronic publication) the term markup was extended to cover all sorts of special codes to govern formatting, processing and analysis. Textual scholarship has adapted to suit this relatively new medium. To make the textual record suitable for digital re-presentation, metadata in the form of specialised markup languages are used. This will be explained in much greater detail in the next section. + + + + + + + + + +Characteristics of Markup Languages and Types of Markup + + + + +As mentioned in the previous section, markup languages are used to add meta information to data. Data can be anything, it could be the settings of a computer programme, it could be entries in a telephone book or a library catalogue, it could be prose texts or poetry. Markup languages provide a way to describe this data. Markup languages should be distinguished from programming languages. Markup does not, in itself, process or style text. It must be used in conjunction with either a programming or scripting language, such as XSLT or CSS. + +In the case of HTML, for example, HTML tags are used to structure the text (paragraphs, titles, footnotes, etc), to add in other media, such as images, audio, or video files, and to add elements that allow scripting to be embedded into the HTML page. 
Browsers can interpret these HTML elements to process the text so that it not only appears on your monitor, but links via hyperlinks to other pages, displays images in the appropriate place and the appropriate size, etc. + +HTML and LaTeX are examples of **presentational or procedural markup languages**. They are used primarily to indicate how text and data are supposed to be processed by software. For instance, HTML developers are primarily concerned with how something looks in the web browser. However, in recent years with the development of HTML5, HTML has become more similar to **descriptive or semantic markup**. This kind of markup language describes the data that is encoded, e.g. a text, an address, a pizza menu, or a stanza of a poem. The benefit of this kind of markup is that it can be understood by software such as search engines. A typical example of a descriptive markup language is the Text Encoding Initiative (TEI). Markup is typically not visible to the reader of electronic text. Rather, it serves as instructions for software to process text. + +| | +|---| + + + + + + + + + +Markup Languages vs Plain Text + + + + +As mentioned previously, markup languages both inherit characteristics from earlier print-based markup practices, while extending and augmenting those practices for electronic reading, analysis, and distribution. Markup can be expressed, as we saw previously, via handwritten characters or symbols. For computer-based processing, the markup or annotation vocabulary must be itself expressed in machine readable form in a standardised manner. + +But why, you might ask, do we even need markup? Why not simply use plain text? Plain electronic texts, such as txt-files downloaded from online archives such as Project Gutenberg, usually do not contain any markup. For a human reader this tends not to be a problem as long as she knows the language, writing system, and genre conventions.
For example, most readers of Indo-European languages will recognise the form of the following genre types, without needing to read the content: + +| ![](/assets/content/assets/en/resources/hosted/text-encoding-and-the-tei/12e535814338e541c9ee62c66827b2f87676456e.png) | ![](/assets/content/assets/en/resources/hosted/text-encoding-and-the-tei/fb98bcd1f127a69d7f4190ef21f77ddd9e553bfb.png) | +|---|---| + + + +We learn how to read texts, how to distinguish individual words, and how to interpret words in the context of their linguistic structure. Even if texts are written without spaces between the words or without punctuation in scriptura continua, a human reader, used to this writing system, can identify words and sentences and hence discern their meaning. + +Similarly, Cross-Hatch writing, a common convention in the Eighteenth Century to save money on post, in which the author of the letter would write horizontally across the page, and then when she reached the bottom of the page, would turn it vertically and write across the previous writing. Ancient manuscripts written in scriptura continua and Eighteenth Century cross-hatch letters proof that humans can adapt to writing conventions that lack white space and/or punctuation. For computers, however, plain text is more difficult to format, and can be more difficult to process, particularly when features of individuals words (such as parts of speech), grammatical constructs, or structures of meaning within a text need to be taken into account. + +| ![](/assets/content/assets/en/resources/hosted/text-encoding-and-the-tei/c17de6f9b1c9ca01635503424751dd493b9b595d.png) | +|---| +| ![](/assets/content/assets/en/resources/hosted/text-encoding-and-the-tei/81088c251e7743b7204ea63d6121c98db7b803f0.jpg) | + + + +Hence markup is an important enhancement that expands the potential of electronic texts. Markup languages tend to be human readable (i.e. 
they do not need to be compiled like programming languages) as well as descriptive; for example, this fragment of HTML indicates that there are two headings, and the first is more major: + +``` +<h1>Markup Languages and Text Modelling</h1> +<h2>Markup Languages vs Plain Text</h2> +``` + + + + + + + + +This type of machine-readable inline annotation is also referred to as meta-information or metadata as it enhances the primary text for computers to process further, as opposed to earlier processing instructions, such as format-17 to indicate a title of a work. + +Early forms of electronic markup languages were used in the 1960s. In a humanities context the software COCOA used a form of markup language that was very influential. COCOA was used for the analysis of texts and to generate indexes of words and concordances of texts. These texts had to be prepared and enriched with markup for annotation and meta information (for further information, see Hockey 27-30). The following fragment shows a short example from a dramatic text. Author of a work, the Title of the work, the Act, and Scene, etc. could be encoded and processed with COCOA: + +``` + + +``` + + + + + + + + +Another example of a markup language is LaTeX. LaTeX is a language that is used to describe documents for typesetting. LaTeX has been developed since the 1970s and is one of the most frequently used languages for typesetting in the print industry. For instance, the above HTML example could be written in LaTeX in the following way: + +``` +\begin{document} +\chapter{Markup Languages and Text Modelling} +\section{Markup Languages vs Plain Text} +\end{document} +``` + + + + + + + + +Markup languages are used to enrich plain text with information which is readable by both humans and machines. The previously mentioned HTML and XML, the language we will learn in this course, both derive from SGML (Standard Generalised Markup Language).
SGML became an international standard in 1986 and is based on an earlier language called GML (Generalised Markup Language). SGML is not an encoding scheme per se, 'but a syntax or a framework within which encoding tags can be defined' (Hockey 33). SGML, and hence XML, provides a syntax in which communities of practice can describe tags, and hence the concepts, that are important to capture in their view of texts and textuality. All these languages follow the same convention, specifically angle brackets to distinguish the annotation, known as tags and elements, from the primary text. It is the same in HTML and XML. Recall the above HTML example: + +``` +<h1>Markup Languages and Text Modelling</h1> +<h2>Markup Languages vs Plain Text</h2> +``` + + + + + + + + +There is always an opening tag (e.g. <h1>) and a closing tag (e.g. </h1>) indicating the start and the end of a heading. These tags designate a semantic function of the text. TEI, the encoding standard of the Text Encoding Initiative, is an XML-language and the same convention applies and angle brackets are used to distinguish tags from data. TEI, however, has a much larger and richer vocabulary of elements to capture many more features across a wide range of text types, from tombstones to manuscripts to the born digital. + +**Further Reading** +SGML Users' Group. 'A Brief History of the Development of SGML'. 1990. Online. +Hockey, Susan M. *Electronic Texts in the Humanities: Principles and Practice*. Oxford ; New York: Oxford University Press, 2000. + + + + + + + + +## What is text really? + + + + + + + +The OHCO (Ordered Hierarchy of Content Objects) model + + + + +One of these theories comes directly out of the inherent structure of a markup language such as SGML (discussed in the Markup Languages and Text Modelling section), and its derivative languages, including HTML and XML. SGML structures data hierarchically. In other words, elements must be nested within one another like a series of Russian dolls.
Moreover, there are rules languages (Schema and DTDs, discussed in the Modelling with XML section that follows) that govern for example, where, how frequently, and if elements can be used one or more times within another element. + +Consequently a meta-language like XML or SGML enforces a particular way of thinking about texts. In the late 1980s and into the 1990s a group of scholars including Allen Renear, Elli Mylonas, and David Durand developed a theory for the practice of text encoding specifically, and text more generally. It was called an ‘Ordered Hierarchy of Content Objects’ (OHCO). They proposed that although different kinds of texts may be organised differently, they still have a hierarchical organisation. In their first iteration of the OCHO model, they asserted + +> Within the lowest level subsections are objects like paragraphs, sentences, prose quotations, verse quotations, equations, proofs, theorems, and so on. Many of these objects can be decomposed further. This structure is hierarchical because these objects “nest” inside one another like Chinese boxes. It is ordered because there is a linear relationship to objects – for any two objects within a book one object comes before the other. (Renear) + +These nesting objects are more familiar, in books, for example, chapters, sections, paragraphs, lists, and so forth. Like the Russian doll example, these content objects fit neatly into one another, from the smallest (a letter or a word) to the largest (a book or monograph), with a myriad of other nested units in between (sentences, paragraphs, chapters, sections, etc). + +While this theory was instrumental in the development of the TEI, it never fully accounted for the problem of overlapping hierarchies. Overlapping hierarchies breaks the neatly nesting pattern described above. For example, a metaphor in a poem may cut across two or more lines (marked by the tag <l>). 
It might seem like a purely technical issue that a language like XML requires one element to close before another opens, as in the following: + + +- <l>text text text</l> +- <l>text text <metaphor> text text</metaphor></l> + +as opposed to the following: + + +- <l>text text <metaphor> text text</l> +- <l>text text </metaphor></l>. + +The creators of the OHCO theory concede that this may be more than a technical issue and that it may point to some of the thorniest issues surrounding text encoding as an intellectual endeavor. Textual editors such as Jerome McGann, D.F. McKenzie and Peter Schillingsburg had been developing different understandings of text. They found the OHCO model was too simplistic a model to represent complex works, particularly literary. Essentially they argued that texts can have multiple structures and diverse meanings, some outside the text itself. These vying approaches to textual editing cumulated in ‘mock confrontation’ between Renear and McGann at the Digital Humanities conference at the University of Virginia in 1999. McGann argued for the complex and overlapping structures of texts using poetry as an example: + +> poetry is not organized in a determinate hierarchy. TEI and SGML markup, therefore, while reasonably adequate vehicles for expository and informational texts, fails to render those features of poetic text that are most salient for its makers and users. Poetical texts are recursive structures built out of complex networks of repetition and variation. No poem can exist without systems of ‘overlapping structures (Hockey) + +For example, metaphors may span many lines or stanzas of verse. Narrative events may span many paragraphs and indeed may overlap. Verse drama contains dialogue lines (speeches), metrical lines, and sentences. But these sentences and metrical lines overlap in the case of enjambment or when a character begins talking and another interrupts (Renear). All these hierarchies have equal claim to representation. 
+ + +Thus the theory of text that SGML (and hence XML and TEI) most eloquently expresses is what one might term the editorial or bibliographic; that is, representing the text in terms of sentences, paragraphs, chapters, front and back matter, and so on. This is not surprising given SGML’s roots as a language written to publish documentary texts in electronic form. From this point of view, one might deduce that the documentary view of text can be read as its *only* structure. + +While Renear and his colleagues argued for the OHCO model through the mid-2000s in a series of articles that further refined and developed their original OHCO proposition, within the TEI community, most academics favour McGann’s view (while necessarily having to conform to an OHCO model as that is what the language most natively facilitates) in which text encoding is approached from the perspective of complex text structures that do not necessarily all fit into a rigid hierarchy. + +**Further Reading** + +Hockey, Susan, Allen Renear, and Jerome McGann. '[Panel: What is text? A Debate on the Philosophical and Epistemological Nature of Text in the Light of Humanities Computing Research.](http://)' Web. 29 August 2016. + +McGann, Jerome. *Radiant Textuality: Literature After the World Wide Web.* London: Palgrave, 2001. + +Renear, Allen, Elli Mylonas, and David Durand. [“Refining Our Notion of What Text Really Is: The Problem of Overlapping Hierarchies](http://.).” N.p., 6 Jan. 1993. Web. 17 Sept. 2012. + +Schloen, David and Sandra Schloen. '[Beyond Gutenberg: Transcending the Document Paradigm in Digital Humanities](http://www.digitalhumanities.org/dhq/vol/8/4/000196/000196.html)'. Digital Humanities Quarterly. Vol. 8 No. 4. 2014. + +Shillingsburg, Peter L. *Scholarly Editing in the Computer Age: Theory and Practice.* Ann Arbor: University of Michigan Press, 1996. + + + + + + + + + +So what is text? 
+ + + + +Until recently, most textual editors worked in disciplines that recorded the written record -- not only in manuscript and print form -- but in any text-bearing object, including stone, metal, and wood. In the 1980s, D.F. McKenzie, in *Bibliography and the Sociology of Texts*, argued for a more embracive conception of text, to include any form of transmission, not simply of the written word, but of ideas, whatever form they may take. He thus defines text as any 'verbal, visual, oral, and numeric data in the form of maps, prints, and music, of archives of recorded sound, of films, videos, and any computer-stored information, everything in fact from epigraphy to the latest forms of discography' (13). McKenzie was prescient in including the born digital: texts that originate on a computer, which include not only machine-readable text, but multimedia and code. + +Thus textual editing in the 21st century includes not simply the forensics traditionally associated with textual editing of tracing a text over time back to the original version or sifting through an author's manuscripts to recreate the writing process, but uncovering the trail of correction and revision by examining the drives and discs of an author (see Matthew G. Kirschenbaum's *Mechanisms*). + +After the Second World War, textual editors began to move to computers to model text, originally for text types such as concordances and indexes. But increasingly, particularly after the advent of the World Wide Web, the computer is a medium that is surpassing print as a way to make scholarly editions available. The ways in which the text of these editions is marked or tagged reflect the theoretical perspectives embedded in the markup itself. + +**Further Reading** + +Kirschenbaum, Matthew G. *Mechanisms: New Media and the Forensic Imagination*. Cambridge, Mass.: MIT Press, 2007. + +McKenzie, D.F. *Bibliography and the Sociology of Texts*. Cambridge: Cambridge University Press, 1999. First published 1985. 
+ + + + + + + + +## Modelling with XML + + + + + + + +XML modeling languages + + + + +XML, like SGML before it, allows you to create your own element names. In the example used earlier we could have used the element <pers> instead of <personName> and <townName> instead of <town>: + +| ``` 1 2 3 4 5 ``` | ``` Robert Smith Castle street 123 Dublin 0891232245 ``` | +|---|---| + + + + + +Therefore, the above example could also look like this: + +| ``` 1 2 3 4 5 ``` | ``` Robert Smith Castle street 123 Dublin 0891232245 ``` | +|---|---| + + + + + + + +The XML encoding still successfully describes the encoded data and there are probably many tens of ways of naming these elements. For humans using similar element names may not make a difference because we realise that <pers> or <personName> or <persName> refers to a person's name. However computers typically work on string matching, thus the elements <pers>, <personName> and <persName> represent three different things, as unrelated as <pers> <dog> <fish>. + +To help enforce uniformity in element naming and syntax, SGML used a language called Document Type Definition (DTD). DTDs can also be used with XML. DTDs are a set of rules that specify what elements are used in an SGML/XML document, how frequently they can be used, and what attributes they can contain. + +DTDs represent an abstract model of the structure of an XML document, while providing a mechanism for a an individual or community to develop a vocabulary of element names that suits the data they are encoding. In due course, the W3C, the organisation that monitors and develops HTML and XML standards, developed a rules-based language specifically for XML documents written in XML syntax. While DTDs can still be used with XML, the more powerful and flexible XML Schema and RNG are more commonly used. Later in this course we will introduce you to DTD syntax as it is easier to learn and use than schema languages, while the general principles of modelling are similar. 
+ + + + + + + + + +Why would you model a text with XML? + + + + +XML is the acronym for eXtensible Markup Language (XML). XML was developed in the 1990s and is based on Standard Generalised Markup Language (SGML). The original function of SGML was to provide a software-independent language for the formatting of text. SGML, as is its derivative XML, more properly a metalanguage whose syntax and structure can be used to describe vocbularies for specific domain use. Hypertext Markup Language (HTML), the language that powers the World Wide Web, is probably the most well-known SGML vocabulary. Because SGML was developed in a pre-internet environment, in the late 1990s XML was developed to overcome some shortcomings of SGML and to provide a lightweight and easy-to-learn markup language for data exchange. Today XML is a central language for data exchange and many other languages such as XHTML, SVG or RDF are based on XML syntax. + +| | +|---| + +XML is a descriptive or semantic markup language. With XML all kinds of data can be encoded in a way so they can be understood not only by humans, but also computers. As humans we can understand an address information such as: + +``` +Robert Smith Castle street 123 Dublin 0891232245 +``` + + +However, even for humans it might not be clear that the last 10 digits are a telephone number. Both SGML and XML allow you to describe plain text in a semantically-rich way. For example, you could tag the person name, street, town and telephone number separately with XML elements that indicate what each part of this text is: + +| ``` 1 2 3 4 5 ``` | ``` Robert Smith Castle street 123 Dublin 0891232245 ``` | +|---|---| + + + + + +Now it is much clearer what each element of the text represents with the added benefit that the XML elements can be processed by a computer programme. 
If you had hundred address entires all encoded in the same way, software can easily generate a list of telephone number or search only through the person names to check if a 'Robert Smith' is among your list of addresses. + + + + + + + + + +Overlapping hierarchies + + + + +One of drawbacks of modelling with XML is that XML does not allow for overlapping hierarchies. The issue was already mentioned in the context of the discussion of OHCO (in the 'What is Text Really Section'). Essentially, the issue is that XML requires a hierarchical data model and all elements need to be nested within other elements. This means that an XML element cannot be opened inside of one element and closed in another. The correct nesting of elements was this example used previously: + +| ``` 1 2 ``` | ``` text text text text text text text ``` | +|---|---| + + + + + + +In the above example, the first <l> element opens and closes, then the second <l> element opens, and within it the <metaphor> tag is opened and closed, and only then is the <l> element closed. On the other hand the next example is incorrect because the metaphor tag breaks the nesting by starting within one <l> element and ending in the next: + +| ``` 1 2 ``` | ``` text text text text text text . ``` | +|---|---| + + + + + +Yet, the second representation may more accurately represent the content. A metaphor in a poem may cut across two or more lines (marked by the tag <l>). It might seem like a purely technical issue that a language like XML requires one element to close before another opens, as in the following: The creators of the OHCO theory concede that this may be more than a technical issue and that it may point to some of the thorniest issues surrounding text encoding as an intellectual endeavor. Text encoding, like any other area of textual scholarship, is not theory-free. It is subjective, theoretical, and interpretative. 
Texts, particularly literary texts, have competing hierarchies, all of which may have equal claim to being represented as they express different views of the text. For example, the hierarchy that SGML and hence XML most eloquently expresses is what one might term the editorial or bibliographic; that is, representing the text in terms of sentences, paragraphs, chapters, front and back matter, and so on. This is not surprising given SGML’s roots as a language written to publish documentary texts in electronic form. From this point of view, one might deduce that the documentary view of text can be read as its *only* structure. + +Yet there are many textual features that do not conform to this hierarchy. As mentioned previously, metaphors may span many lines or stanzas of verse. Narrative events may span many paragraphs and indeed may overlap. Verse drama contains dialogue lines (speeches), metrical lines, and sentences. But these sentences and metrical lines overlap in the case of enjambment or when a character begins talking and another interrupts (Renear 119–21). All these hierarchies have equal claim to representation. There are possiblities to get around this problem by using empty elements and they will be discussed in a later section. + +Further Reading + +Text Encoding Initiative, Non-hierarchical Structures, <[http://www.tei-c.org/release/doc/tei-p5-doc/en/html/NH.html>](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/NH.html) + +Renear, Allen, Text Encoding, A Companion to Digital Humanities, ed. Susan Schreibman, Ray Siemens, John Unsworth. Oxford: Blackwell, 2004, <[http://www.digitalhumanities.org/companion/](http://www.digitalhumanities.org/companion/)> + + + + + + + + + +XML and its advantages + + + + +The Extensible Markup Language (or XML) is a specification of the World Wide Web Consortium (W3C). As discussed in the last unit, XML is a metalanguage and can be used to describe custom markup languages that conform to the basic rules of XML. 
The entire XML specification can be found on the W3C website at https://www.w3.org/XML/. + +Today XML is used in the world of technology and in particular on the Internet. For instance, XML-based markup languages such as XHTML (an XML version of HTML), SVG (a standard to describe graphics) and RDF (a standard that is central to the semantic web) are central to the Internet today. TEI, the encoding recommendations of the Text Encoding Initiative, has also used XML since 2004. XML was originally developed as a standard for data exchange over the web and its main advantages are that it is a device-independent standard, that it is machine and human readable, and that XML promotes a clear separation between data and presentation. + +#### XML is device independent + +One of the main advantages of XML is its high degree of interoperability. XML data can be used by different programmes on different platforms. XML files are simple text files and are not platform or software dependent. Consequently, an XML document can be written on a Mac, stored on a Linux server and downloaded to a Windows PC. All these operating systems are able to read an XML file. At the same time, XML can be used to exchange data between different programmes. For instance, since Microsoft Word introduced its XML-based document format, Word documents can be viewed and edited by other software such as LibreOffice. + +#### XML is machine and human readable + +Another advantage of XML is that it can be written, read and altered by humans as well as computers. No special software is required to access the data in XML documents. A simple text editor such as Notepad, Vi or TextEdit can be used to open and edit XML documents. + +For instance, before Microsoft Word used XML it was very difficult to rescue your data from a corrupted Word document. Now, the XML content of a Microsoft Word document can easily be extracted and viewed with a simple text editor. 
The following video will show you how you can open a Microsoft Word document and read its XML content. Try to follow it on your own computer and stop the video if necessary. Instead of a Microsoft Word document, you could also try the exercise with a LibreOffice document. + +| | +|---| + +#### XML promotes a clear separation of data and presentation + + +The third major advantage of XML documents is that XML promotes a clear separation of data and presentation. This enables the possiblity that XML encoded data can be exchanged without presentational information attached. On the one hand this reduces file size overhead and is an advantage for the exchange of huge amounts of data over networks, on the other hand it allows for the presentation of the same data in different formats. Using an XML parser, a software developed to access and manipulate XML documents, the same underlying data can be used in multiple and very different presentation scenarios and the same XML document can be convert to a PDF, a Microsoft Word document and a web page as per the image below. + +| ![](/assets/content/assets/en/resources/hosted/text-encoding-and-the-tei/b28abb73c0ea8f07ef02ec04a0c3df7b56a581de.png) | +|---| + + + + + + + + + +Examples of XML languages + + + + +As XML is a metalanguage. It has no predefined tags. Rather, it provides a syntax for the development of vocabularies, including XHTML and SVG. These are two standards that have become important in web development and they highlight two different XML use cases: for the description of web pages and for the description of graphics. + +#### XHTML + +One of the best known XML standards is probably XHTML. XHTML is an XML vocabulary for the description of web pages. The basic structure of an XHTML document consists of an <html> root element and two nested child elements, <head> and <body>. The head section may contain metadata and links to external files (e.g. 
CSS and JavaScript files), while the body section contains the page content that is displayed by the web browser. The page content in the body section can be further structured using elements to identify divisions, <div>, headings, e.g. <h1>, <h2> and <h3>, paragraphs, <p>. Furthermore images and other media can be included with XML elements. + +| ``` 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ``` | ``` "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">   <html xmlns="http://www.w3.org/1999/xhtml">   <head> <title>Title of the XHTML documenttitle> <meta property="dc:creator">Roman Bleiermeta> head> <body> <h1>The main headingh1> <p>a paragraphp> body>   html> ``` | +|---|---| + + + + + + +A crucial difference between XHTML and HTML is that because HTML is not an XML language it does not need to follow the XML rules (although it does follow SGML rules). For instance, in HTML5 not all elements need to be closed with a closing tag, while XHTML requires that every opening tag has a corresponding closing tag. For more information on XHTML and the differences between HTML and XHTML we recommend online tuturials such as the one provided by [W3schools](http://www.w3schools.com/html/html_xhtml.asp). + +#### SVG + +SVG is a standard developed by the W3C to describe vector graphics. SVG is an XML based language that uses the same basic syntax as other XML languages. Like all XML languages also SVG files can be opened with a simple XML editor and it contains human-readable data that is interpreted by an SVG processor as a graphic. For instance, the following statement describes a black circle with a radius r of 50: + +``` + +``` + + + + + + + +SVG documents are essentially a series of XML statements that describe graphic elements. These statements can easily be changed and rewritten with a programming language such as JavaScript. Therefore, SVG images are becoming increasingly popular in interactive web development and online gaming. 
W3Schools has a gaming tutorial that uses SVG ([http://www.w3schools.com/games/](http://www.w3schools.com/games/)). However, SVG is also used in the print publishing industry for high-quality images for posters. + +If you are interested in learning more about SVG, the [tutorial of W3Schools](http://www.w3schools.com/svg/) or the [Introduction to SVG](https://developer.mozilla.org/en-US/docs/Web/SVG/Tutorial/Introduction) by the Mozilla Developer Network are good places to start. + + + + + + + + +## Modelling with TEI + + + + + + + +The TEI schema + + + + +You have already heard about schemas and DTDs in the Modelling with XML section. Like every XML language TEI requires a schema that explains how elements and attributes are used in a TEI document. The TEI community provides a general schema that defines all TEI modules and attribute classes. This schema is called **tei\_all** and all TEI documents have to conform to this schema. The tei\_all schema exists in several different forms, for instance as a RELAX NG schema, a schema language developed for XML documents, or as a DTD. The two schema versions can be found under the following URLs: + +``` +https://www.tei-c.org/release/xml/tei/custom/schema/dtd/tei_all.dtd +``` + +``` +https://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng +``` + +The TEI community also provides subsets of tei\_all, because it can be very confusing and time consuming to choose from 500 TEI elements especially if most of them are not required for the encoding of your text. If you are encoding poetry you will probably not need the elements and attributes that are needed to encode dictionaries. Therefore, the TEI subsets are selections of modules and attribute classes that you might most likely need in your encoding project. The TEI Lite customisation was developed to suit about 90% of TEI-based projects and it is very often used as a starting point for further customisation. 
Additionally, schemas exist for Manuscript Description, Drama, etc. A list of custom schemas and description can be found on the following website: + +``` +http://www.tei-c.org/Guidelines/Customization/

+``` + + +DTDs and schemas are used as a technical specification of an TEI/XML model. TEI has also its own modeling language called ODD (One Document Does it all). An ODD contains schema fragments as well as prose descriptions of element and attribute usage and examples from the TEI guidelines. An ODD can be used to generate a schema to document the XML model and a prose documentation in HTML or PDF. ODD is an holistic way to document a TEI customisation and it is a requirement for every TEI project to provide a detailed ODD model. You will learn more about ODD in a later session. + +Further reading: + +Elena Pierazzo, Modelling (Digital) Texts, in Digital Scholarly Editing: Theories, Models and Methods, 2015, <[http://hal.univ-grenoble-alpes.fr/hal-01182162](http://hal.univ-grenoble-alpes.fr/hal-01182162)> + +Text Encoding Initiative, The TEI Infrastructure, <[http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ST.html](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ST.html)> + +
+ +
+ + + + + +Why use TEI? + + + + +In the previous unit we have learned how one can model data using XML to represent and describe a specific type of data. We could thus create a model in XML for a poem as below: + +| ``` 1 2 3 4 5 6 ``` | ``` Mary had a little lamb, Its fleece was white as snow, And everywhere that Mary went The lamb was sure to go. ``` | +|---|---| + + + + + +In our model above the entire poem is represented using the <verse> element and individual lines of the poem using the <line> element. If we all use our own custom markup languages this can get confusing and makes interchangeability virtually impossible. Somebody else might use <poem> and <LN> to encode poems and somebody in Germany might use <Gedicht> for poem and <Zeile> for line. This need for a common vocabulary is not new. By the mid-1980s there was a clear need for a common format. Academics, librarians and archivists from North America and Europe met and developed what was to become the Text Encoding Initiative, which has become the de-facto standard for encoding texts in the humanities.The acronym TEI stands both for an encoding standard for electronic texts, Text Encoding for Interchange, and for the consortium that releases and continuously develops this standard, the Text Encoding Initiative. The TEI consortium was established in 1987 as an international research project to develop a standard to ‘facilitate the creation, exchange, and integration of textual data in machine-readable form’. The goal was to create a standard that would support the encoding of ‘all kinds of texts, in every human language, from every historical or social context’. A challenging goal! + +The main characteristics and benefits of the TEI markup language are that TEI was designed to encode meaning (descriptive markup language), to be software independent, and to be community-driven. The TEI recommendations are continuously updated and occasionally major releases are published. 
These major releases are numbered incrementally starting with TEI P1 (in 1990) to the latest release TEI P5 (in 2007). Since 2011, TEI is also registered as its own media type (RFC 6129). + +Since the first draft of the TEI guidelines was released in the 1990s, TEI has developed into one of the most important encoding standards within the humanities. The first TEI guidelines P1 to P3 are based on SGML, while the more recent standards – TEI P4 (June 2002) and TEI P5 (November 2007) – use XML. + +Further reading: + +Lou Burnard, The Evolution of the Text Encoding Initiative: From Research Project to Research Infrastructure, in Journal of the Text Encoding Initiative, 2013, [http://jtei.revues.org/811](http://jtei.revues.org/811) + +Lou Burnard, What is the Text Encoding Initiative, 2014, [http://books.openedition.org/oep/426?lang=en](http://books.openedition.org/oep/426?lang=en) + +Nancy M. Ide and C. M. Sperberg-McQueen, The Text Encoding Initiative: Its History, Goals, and Future Development: [http://www.cs.vassar.edu/~ide/papers/teiHistory.pdf](http://www.cs.vassar.edu/~ide/papers/teiHistory.pdf) + + + + + + + + + +TEI Guidelines + + + + +The TEI Guidelines define and document the standard for electronic Text Encoding for Interchange (TEI). The Guidelines describe what TEI/XML elements and attributes are allowed and how they should be used. The Guidelines contain a declaration and description of each TEI element, code examples and several thematic chapters that explain how TEI elements and attributes should be used. +TEI is a language that was developed for the modelling of various texts in the humanities. Therefore, TEI does not promote one model of a text, but is flexible enough to allow for a researcher to choose or create a model that suits her or his research needs. TEI has over 500 predefined elements organised in modules. Each module and the associated elements are described in the Guidelines. 
A TEI module groups together associated TEI elements such as the TEI elements recommended for the encoding of drama or dictionaries. There are also more general TEI modules which contain 'core' and 'header' elements, basic elements most likely to be used in all TEI documents. A full list of modules from the TEI guidelines: + +| module name | description | +|---|---| +| analysis | Simple analytic mechanisms | +| certainty | Certainty and uncertainty | +| core | Elements common to all TEI documents | +| corpus | Header extensions for corpus texts | +| declarefs | Feature system declarations | +| dictionaries | Dictionaries and other lexical resources | +| drama | Performance texts | +| figures | Tables, formulae, and figures | +| gaiji | Character and glyph documentation | +| header | The TEI Header | +| iso-fs | Feature structures | +| linking | Linking, segmentation and alignment | +| msdescription | Manuscript Description | +| namesdates | Names and dates | +| nets | Graphs, networks and trees | +| spoken | Transcribed Speech | +| tagdocs | Documentation of TEI modules | +| tei | Declarations for datatypes, classes, and macros available to all TEI modules | +| textcrit | Text criticism | +| textstructure | Default text structure | +| transcr | Transcription of primary sources | +| verse | Verse structures | + + + + + + + +Besides modules, the TEI elements and attributes are also organised in model classes and attribute classes. The model classes group elements together based on the location they are appearing. For instance, the model 'nameLike' groups elements that can be used to tag various names such as person name, place name, organisation name. A full list of model classes can be found as [Appendix A](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/REF-CLASSES-MODEL.html) of the Guidelines. Another important building block of TEI/XML documents are attributes. Attributes are used to store additional information about an element and its content. 
In the TEI attributes are grouped together in attribute classes. One of the most important attribute classes is the 'global' class. It groups together TEI attributes that can be used on all TEI elements such as the attribute @xml:id (used for an identifier) or @n (used for a number or label). Some classes have also subclasses. For instance, the 'global' class has a subclass 'global.rendition'. This subclass contains attributes that describe rendition and styling of an encoded textual feature. The attribute classes are listed and documented in the TEI guidelines in [Appendix B](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/REF-CLASSES-ATTS.html). + +Knowledge about TEI modules, model classes and attribute classes is essential for customisations. Customisations may be necessary in order to create a TEI model that is a close representation of a project-specific understanding of text. + + + + + +
+ + +# Unit II: Modelling with XML and DTDs + +## Basic XML rules: well-formed and valid XML + + + + + + + +Well-formed vs valid XML + + + + +XML documents are essentially text documents containing data and markup. Data can be text or numeric. Markup consists of the tags used to add structure and semantic information. XML processing software expects basic rules to be followed that distinguish data from markup. If you are familiar with HTML, XML will seem quite familiar as it uses a similar markup system. + +All XML documents have to conform to a set of rules that were developed by the World Wide Web Consortium (W3C), a community of organisations that develop web standards for core languages of the WWW (including HTML, CSS and XML). The rules for XML documents are described in the XML specification (https://www.w3.org/XML/). XML processing software expects XML documents to be structured following these rules, and any divergence causes what is known as a parsing or syntax error. + +[W3Schools](http://www.w3schools.com/xml/xml_validator.asp) identifies five key rules for XML syntax. Documents that conform to these rules are called *Well-Formed*: + + +1. XML documents must have a root element +2. XML elements must have a closing tag +3. XML tags are case sensitive +4. XML elements must be properly nested +5. XML attribute values must be quoted + +*Valid* XML, on the other hand, is a document that has additional constraints. In addition to being Well-Formed, a Valid XML document adheres to a set of rules as defined by a Document Type Definition (DTD) or a Schema. A Parsing Editor, software that understands XML syntax, will check your documents for conformity in terms of the rules of Well-Formedness, and if your document calls a DTD or Schema, the parsing editor will also check that your document abides by additional constraints, which include element and attribute names, where in the document elements and attributes can be used, and how frequently elements can be used. 
+ + + + + + + + + +Quiz: XML names + + + + +Each element in the quiz below is not well-formed XML, but for different reasons. Choose the correct reason for its being invalid XML. + + + +Right answer! + + + +Wrong answer! + + + +<newElement> </Newelement> + + + +XML elements are case sensitive + + + +XML names may contain characters, digits, underscore and dots, but cannot contain ampersands. + + + +An XML element name cannot start with the string 'xml' or 'XML'. + + + +An XML name cannot contain whitespace + + + + + + +Right answer! + + + +Wrong answer! + + + +<my.new_Element&attribute></my.new_Element& attribute> + + + +XML names may contain characters, digits, underscore and dots, but cannot contain ampersands. + + + +XML elements are case sensitive + + + +An XML element name cannot start with the string 'xml' or 'XML'. + + + +An XML name cannot contain whitespace + + + + + + +Right answer! + + + +Wrong answer! + + + +<xmlElement></xmlElement> + + + +An XML element name cannot start with the string 'xml' or 'XML'. + + + +XML elements are case sensitive + + + +XML names may contain characters, digits, underscore and dots, but cannot contain ampersands. + + + +An XML name cannot contain whitespace + + + + + + +Right answer! + + + +Wrong answer! + + + +<new Element> </new Element> + + + +An XML name cannot contain whitespace + + + +XML elements are case sensitive + + + +XML names may contain characters, digits, underscore and dots, but cannot contain ampersands. + + + +An XML element name cannot start with the string 'xml' or 'XML'. + + + + + + + + + + + + + + +Building-blocks of XML documents + + + + +#### Elements + +Within an XML document data is contained within elements. Elements have a start-tag and an end-tag. Start- and end-tags consist of an **element name,** which is a string of text such as ‘PersonName’, and a delimiter indicating the beginning and end of a tag. XML tags are delimited from the stored data by use of angle brackets (or inequality signs). 
The bracket < is used to indicate the beginning of an XML start-tag and > is used to indicate the end of a start-tag. + +The end-tag starts with the delimiter </ and ends with the angle bracket >. The following would be an example of an XML element describing a person name. The content of the element, or in XML parlance, PCDATA (Parsed Character Data) 'Clark Kent' appears between the start- and end-tags: + +| ``` 1 ``` | ``` Clark Kent ``` | +|---|---| + + + + + + +A special case are so-called **empty elements**. Empty elements do not contain PCDATA, and one tag functions as both the start and end tag. Empty elements are typically used as milestones. The XHTML <br />, which indicates line breaks, would be typical of this. This structure is frequently necessary to indicate where breaks occur in a source text (e.g. a page or section break in a book or an article). + +In the example of our element <PersonName> the element name is a string of characters. The rules for XML element names are that names must start with a letter or underscore, they can contain letters, digits, hyphens, underscores, and periods and they are case-sensitive. The following table shows examples of valid XML names: + +| **Correct XML element names** | +|---| +| <\_newElement> </\_newElement> | Element names must start with a letter or underscore | +| <newElement> </newElement> | Element names are case-sensitive, start and end tag have to match | +| <my.new\_Element-1></my.new\_Element-1> | Element names can contain letters, digits, hyphens, underscores, and periods | + +Incorrect construction of element names leads to XML data that is not Well Formed. An XML processor will not process the data, but instead report an error. 
The following table shows examples of illegal XML element names: + +| **Illegal XML element names** | +|---| +| <1Element> </1Element> | Element names cannot start with a digit, hyphen or period | +| < Element /> </ Element> | Element names cannot start with an initial space | +| <newElement> </Newelement> | Element names are case-sensitive and start and end tag must match | +| <xmlElement></xmlElement> | Element names cannot start with the string xml, XML, Xml, etc | +| <new Element> </new Element> | Element names cannot contain whitespace | + +#### Attributes + +Another building block of XML documents are attributes. Attributes are a way to add additional information in the form of name-value pairs to an XML element. Attributes modify, refine, or further delineate elements. If elements are thought of as nouns, attributes can be likened to adjectives. Attributes have to be placed after the element name of the start tag and they are separated by a whitespace from the element name. In the following example the attributes @first-name and @last-name with values are added to the XML element person: + +``` + +``` + + + + + + + + +Attributes can also be used to store data and sometimes it is difficult to decide if data should be stored as an attribute value or within an element. A clear benefit for storing information within an element is that data can be structured further by nesting other XML elements or with attributes. This cannot be done with an attribute value. The benefit of attributes is that they can be useful to describe an element and its content further and with schemas ,the data that is stored as attribute values can be restricted to specific data types and values. + +For attributes, there are similar naming rules to elements. Names must start with a letter or underscore, they can contain letters, digits, hyphens, underscores, and periods, but cannot contain whitespace. 
Attribute values have to be quoted; they can contain alphanumeric characters, whitespace and various other characters such as period, hyphen, underscore, comma, etc. However, you have to be careful using single and double quotes. If single quotes are used as attribute value delimiter, they are not allowed in the value string. If double quotes are used as attribute value delimiter, they are not allowed in the value string. The following table contains examples of valid use of attributes. + +| **Valid use of attributes in XML** | +|---| +| <newElement attribute1=”attribute value: 1” /> | Attribute name may contain digits, but cannot start with a digit. Attribute values may contain whitespaces, punctuation and alphanumeric characters in any order. | +| <person name=’Rob Miller’ /> <person name=”Rob Miller” /> | Single quotes or double quotes can be used as delimiter of attribute values. | +| <address owner=”Mary’s address” /> | Single quotes can be used within a value string, but only if they are not delimiters. | +| <sentence spoken=’He said: “go”!’ /> | Double quotes can be used within a value string, but only if they are not delimiters. | + + +Incorrect attribute syntax leads to XML data that is not Well-Formed. An XML processor will not process the data, but instead indicate that there is an error. The following table shows examples of illegal attribute names and values: + +| **Illegal use of attribute names and values in XML** | +|---| +| <person name=Robert /> | Attribute values must be quoted. | +| <person name=”Robert’ /> | The opening and closing quote delimiters must match. | +| <sentence spoken=”He said: “go”!” /> | If double quotes are used as delimiters of a value string, they cannot be used in the value string itself. | +| <address owner=’Mary‘s address’ /> | If single quotes are used as delimiters of a value string, they cannot be used in the value string itself. | +| <person first name=”Frank” /> | An attribute name cannot contain whitespace characters. 
| +| <person 1stname=”Robert” /> | Attribute names must start with a letter or underscore | +| <person name=”Robert” name=”James” /> | An element cannot have multiple attributes with the same name. | + + +Elements and attributes are the main building blocks of XML and they are essential for structuring and modelling data. Other important components that are present in most XML documents are *Processing instructions* and XML and Unicode entities. + +##### Processing instructions + +Processing instructions contain instructions for an XML processor specifying what version of XML is used, what character encoding and/or what schema should be used to validate the XML document. Processing instructions can be easily recognised because they are written in the first lines of an XML document. Processing instructions do not have start- and end-tags. However, they do have attributes to store information. For instance, the following processing instruction is the XML declaration and should be at the beginning of every XML document. It declares that this document is encoded according to the XML standard using XML version 1.0 and the character encoding UTF-8: + +| ``` 1 ``` | ``` ``` | +|---|---| + + + + + +##### Unicode entities + +The text stored in an XML element is usually PCDATA (Parsed Character Data). This is a data definition used in XML documents and specifies that the five characters that are used to distinguish mark-up from data, such as angle brackets (for elements), single and double quotes (for attributes) and ampersand (for named entities), have to be *escaped* using Entity references. Entity references begin with & and end with a semicolon. 
The following list shows what Entities have to be used instead of the five illegal characters: + +| Character | XML entity | +|---|---| +| < | &lt; | +| > | &gt; | +| & | &amp; | +| ' | &apos; | +| " | &quot; | + +Another form of Entity references are *Character* references for characters or symbols not contained in the ASCII character set. For instance, Unicode character references can be used within an XML document. Such character references start with &# and end with a semicolon and are directly embedded into an XML document. For instance, the Greek Capital Letter Pi has the character reference: &#x03A0; + + + + + + + + + +Nesting elements + + + + +The XML specifications require that within an XML document all elements have to be nested within a root element. If you take the simple example of an address book, the root element could be <addressBook>. Nested within the root element you may have several <entry> elements containing address information. A very basic structure might look something like this: + +| ``` 1 2 3 4 5 6 7 8 ``` | ``` This is the first entry! This is the second entry! ``` | +|---|---| + + + + + +To explain the basic principle of nesting in XML, the Russian ‘Matryoshka doll’ might make a good analogy. The root element is the largest container. All other elements have to be entirely nested within the root element or another element. In a similar way smaller Matryoshka dolls are nested within bigger ones. + + +![](/assets/content/assets/en/resources/hosted/text-encoding-and-the-tei/15cf90b368ddf75e0e850b1b5e1eceb7cdf44bfb.jpg)In the address book example, all <entry> elements are completely nested within the start and end root element. Furthermore, each address entry can have other elements nested in it. For instance, an entry may have <name> as follows: + +| ``` 1 2 3 4 5 ``` | ``` ``` | +|---|---| + + + + + + +Each entry could have additional elements nested inside it, including house number, street name, town, telephone number, etc. 
Nesting elements provides additional structure which makes searching easier. For instance, consider the following address: + +> Clark Kent 344 Clinton Street Metropolis 55 50145 + +As humans we can read each part of the entry and understand what part is the name, the address and the telephone number. A computer does not understand text in the same way as humans do, hence adding more structure provides for this specificity: + +| ``` 1 2 3 4 5 ``` | ``` Clark Kent
344 Clinton Street Metropolis
55 50145
``` | +|---|---| + + + + + + + + With more elements, even more data can be specified as follows: + +| ``` 1 2 3 4 5 6 7 8 9 10 11 12 ``` | ``` Clark Kent
344 Clinton Street Metropolis
55 50145
``` | +|---|---| + + + + + +That this data is nested provides important information to the computer. For example, since the <address> element encloses <houseNr>, <street> and <town>, it can be inferred that they belong to the same address. + +
+ +
+ + + + + +Element relationships within an XML document + + + + +Conceptually, the structure of an XML document is a tree structure. The *root element* forms the basis of the XML tree and all other element nodes, attribute nodes and text nodes reach out like branches and leaves. Visually this could be represented as a tree graph as in the following figure of the address book in the previous section: + +| ![](/assets/content/assets/en/resources/hosted/text-encoding-and-the-tei/f7fad3bcd13b476edb7bf474b6623f5448753a38.png) | +|---| + +Moreover, various relationships between elements exist. In the **child-parent relationship** one element (the child) is nested in another element (the parent). In the following example the element <name> is nested in the element <entry> and consequently <name> is the child of <entry> and <entry> the parent of <name>. + +| ``` 1 2 3 ``` | ``` Clark Kent ``` | +|---|---| + + + + + + +A **sibling relationship** means that elements (the siblings) have the same parent element. For instance, in our address example the elements <name>, <address> and <phoneNr> have the same parent <entry> and are hence siblings to each other: + +| ``` 1 2 3 4 5 ``` | ``` Clark Kent
344 Clinton Street Metropolis
55 50145
``` | +|---|---| + + + + + +These relationships provide powerful structure for other software, as well as other XML-standards, such as XPath and XQuery to navigate the tree structure for formatting, for searching, and for extracting information from the XML document instance. + +
+ +
+ + + + + +Quiz: Attributes + + + + +Why would you use attributes? + + + +Why would you use attributes? + + + +To provide additional descriptive information about the element content. + + + +To add a comment to your XML documents. + + + +To explain to the XML processor how an element should be processed and how its content should be displayed. + + + +To provide store information that has nothing to do with the content of the element the attribute belongs to. + + + +correct + + + +incorrect + + + + + + + + + + + + + + +Quiz: Root element + + + + +Which one of the following statements is correct? + + + +Which one of the following statements is correct? + + + +A root element does not require a closing end tag. + + + +A root element encloses the entire XML of an XML document. + + + +A root element has to be written in capital letters. + + + +You can't place any attributes on a root element. + + + +correct + + + +incorrect + + + + + + + + + + + + + + +Quiz: XML structure + + + + +How is the structure of an XML document often represented? + + + +How is the structure of an XML document often represented? + + + +As a tree structure. + + + +As a list structure. + + + +As a circle structure. + + + +As a flower structure. + + + +correct + + + +incorrect + + + + + + + + + + + + + + +Quiz: Valid XML + + + + +What are the requirements for valid XML? Which one of the following statements is NOT correct? + + + +What are the requirements for valid XML? Which one of the following statements is NOT correct? + + + +A valid XML document is also a well-formed XML document. + + + +A valid XML does not need to conform to the XML syntax. + + + +A valid XML should not have a DTD or schema. + + + +A well-formed XML document is also a valid XML document. + + + +correct + + + +incorrect + + + + + + + + + + +
+ + +## Introduction to DTDs + + + + + + + +Introduction to DTDs + + + + +As discussed in the previous section, to be a well-formed XML document a set of rules must be adhered to. Many documents also conform to another set of rules that constrain which elements and attributes may be used, where they may appear, and how often. An XML document that is well-formed and conforms to a schema or DTD is called a *Valid* XML document. + +This unit will teach the basic principles of one of these rule sets: Document Type Definition (DTD). DTDs were originally developed to work with SGML and do not use SGML or XML syntax. Every SGML document instance was required to have a DTD. XML is more flexible and does not require documents to conform to a DTD or schema (i.e. they are well formed but not valid). Schema languages include XML Schema, RELAX NG and Schematron. However, the underlying modelling principles are very similar and we teach DTDs here because it is a simpler syntax to learn. The TEI P4 guidelines had their own chapter on the use of DTDs \[[TEI P4 Documentation, Chapter 3](http://www.tei-c.org/Vault/P5/1.0.1/doc/tei-p4-doc/html/ST.html)\]. + +A DTD is a series of statements that can be written either at the beginning of an XML document (an internal DTD, right after the XML declaration), or in a separate file from the XML document (an external DTD). The main advantage of an external DTD is that the rules defined in an external DTD can be applied to many XML documents (called, in this case, document instances). This facilitates consistency of encoding and maintenance of the DTD. For the following examples and exercise we will use internal DTDs, because we are only working with one XML document. However, the same DTD statements would also apply to external DTDs. + +| | +|---| + +#### Further reading + +W3schools, DTD Tutorial, [http://www.w3schools.com/xml/xml\_dtd\_intro.asp](http://www.w3schools.com/xml/xml_dtd_intro.asp) + + + + + + + + + +Quiz: Why schemas or DTD? 
+ + + + +Why use schemas or DTDs? + + + +Why use schemas or DTDs? + + + +DTD and schema statements allow you to style your content (e.g. font-size, color, etc.). + + + +DTDs and schemas are used to define syntax and structure of XML languages. + + + +DTDs and schemas are only used to add comments to your XML document. + + + +DTDs and schemas are required for XML documents. + + + +correct + + + +incorrect + + + + + + + + + + + + + + +Creating Elements + + + + +In this unit you will learn to develop a DTD for a simple address book. An address book usually contains several entries and each entry will contain information about a person including a name, address, and telephone number such as in the following table: + +| Name | Address | Telephone number | +|---|---|---| +| Clark Kent | 344 Clinton Street Metropolis | 55 50145 | +| Bruce Wayne | 1007 Mountain Drive Gotham | 53 59333 | +| | | | + +As was mentioned previously, every XML document must have exactly one root element which encloses all other elements. The root element, like every other element, must be declared in the DTD. Root elements should be descriptive of the content. So, in our case, we will use the root element <addressBook>. The following statement declares that <addressBook> is the root element: + +``` + + +]> +``` + + + + + + + + + +Between the square brackets (in blue) is the declaration of the root element. The DTD element declaration starts always with <! and ends with the ‘greater than symbol’, >.The definition within the round brackets means that the content of the element <addressBook> can only be of type "#PCDATA". PCDATA, or Parsed Character Data, is a data definition used in XML documents and basically means plain text with a few constraints. For instance, an important restriction of PCDATA is that characters such as ampersand angle brackets (< or >) and single and double quotes have to be escaped because these characters are used to distinguish mark-up from data. 
+ +The following video shows how a DTD should be placed at the beginning of an XML document and how a text editor such as oXygen can be used to evaluate if an XML is well-formed and valid. + +| | +|---| + +The above DTD statement specifies an addressBook element that can only store character data such as: + +| ``` 1 ``` | ``` Clark Kent 344 Clinton Street Metropolis 55 50145 Bruce Wayne 1007 Mountain Drive Gotham 53 59333 ``` | +|---|---| + + + + + + +Within the <addressBook> element currently only text is allowed. If more than one address entry, it would soon become confusing and difficult to see where one entry ends and the next one starts. Therefore, another element is needed to separate individual entries. Building on our previous example, we will add an <entry> element so that individual addresses can be separated: + +``` + + + +]> +``` + + + + + + + + +Not only did we declare a second element, we added in additional constraints to the <addressBook> element. In the first example, <addressBook> coud only contain PCDATA (i.e. plain text). But now it cannot contain PCDATA but must only contain the element <entry>. Moreover, the asterix \* after the element name specifies that ‘zero or more’ <entry> elements are contained within <addressBook>. Now each address entry can be encoded as an individual <entry> element as below: + +| ``` 1 2 3 4 ``` | ``` Clark Kent 344 Clinton Street Metropolis 55 50145 Bruce Wayne 1007 Mountain Drive Gotham 53 59333 ``` | +|---|---| + + + + + + + +#### Nesting Elements + + + +``` + + +]> +``` + + + + + + + + +The above DTD statement specifies an addressBook element that can only store character data such as text and numbers: + +| ``` 1 ``` | ``` Clark Kent 344 Clinton Street Metropolis 55 50145 Bruce Wayne 1007 Mountain Drive Gotham 53 59333 ``` | +|---|---| + + + + + + +Within the <addressBook> element currently only text is allowed. An address book has usually more address entries. 
In order to tell the computer where one address starts and another ends an element is needed to separate individual entries. Building on our previous example, we will add an <entry> element so that individual addresses can be separated: + +``` + + + +]> +``` + + + + + + + + +Not only did we declare a second element, we added additional constraints to the <addressBook> element. The <addressBook> element cannot contain PCDATA (text and numbers) anymore. Now it can only contain the element <entry>. Moreover, the asterix \* after the element name specifies that ‘zero or more’ <entry> elements are contained within <addressBook>. Now each address entry can be encoded as an individual <entry> element as below: + +| ``` 1 2 3 4 ``` | ``` Clark Kent 344 Clinton Street Metropolis 55 50145 Bruce Wayne 1007 Mountain Drive Gotham 53 59333 ``` | +|---|---| + + + + + + +Within entry further elements could be nested such as elements for name or street or telephone number. How would you define the elements <name>, <street> and <phoneNr> with DTD? They should all be children of <entry> and contain only PCDATA content. + + + + + + + + + +DTD Exercise I + + + + +In the first part of this exercise, we will create a simple DTD for poetry. In the first part of the exercise, only use the elements below: + +- poem (which will be the root element) +- verse +- line +- title +- note +- author + +To help you with the DTD syntax, we’ve created a [DTD Cheat Sheet](/assets/content/assets/en/resources/hosted/text-encoding-and-the-tei/bd6f7e8302f0f16e8890f2d18ceda3a7b0aef5b3.jpg). Open up the PDF and keep it handy when you are creating the DTD. Create the DTD in a parsing editor such as oXygen. oXygen understands DTD syntax, and once you begin creating the document instance, it will give error messages if you do something wrong. However, until the XML code is in place, you will get parsing errors. 
+ +Start by opening oXygen, and then choose the top left icon, the new document icon, and then choose 'XML Document'. Begin by creating the root element right below the XML declaration. + +| ``` 1 2 3 ``` | ``` ]> ``` | +|---|---| + + + + + +Save your file. Create your DTD using the elements above using the syntax provided in this section. Remember, the element declarations go in between the square \[\] brackets. After you create your DTD, see if the DTD will model your poem the way you expect. To do this, create a document instance right below the DTD. Once everything works correctly, go back and add occurrence indicators to further define the behavior of the elements. + +You can try to create the DTD on your own, or follow along with this video + + + + + + + + + + + +Defining different types of elements + + + + +DTD’s allow for the definition of different types of Elements. It can be specified that an element should contain **text content, element content, mixed content or no content**. Earlier we have already seen how **text content** is defined using the string ‘#PCDATA’ and how elements can be added to our <addressBook>. By replacing the string ‘#PCDATA’ with the name of the element <entry>. This specifies that the element <addressBook> can only contain ‘entry’ elements’ (**element content**). + +Next, we structure our entries even further by providing a more descriptive model for address entries: + +``` +









]> +``` + +The above DTD extends the previous DTD. The content model of <entry> was changed from #PCDATA to allow <name>, <address> and <phoneNr>. The new DTD requires that each entry element has exactly one <name> element, one <address> element and one <phoneNr> element. + +| ``` 1 2 3 4 5 6 7 8 9 10 11 12 ``` | ``` Clark Kent
344 Clinton Street Metropolis
55 50145
Bruce Wayne
1007 Mountain Drive Gotham
53 59333
``` | +|---|---| + + + + + +**Mixed element content** is a combination of text and element content. For instance, within the element <phoneNr> it might be necessary to tag the country calling code with an element <countryCode>, but leave the rest of the telephone number as text. Such mixed element content can be defined in the following way: + +``` + +``` + +``` + +``` + + +The above DTD will allow us to identify the country code in the following way: + +| ``` 1 2 3 4 ``` | ``` 53 59333 ``` | +|---|---| + + + + + + +Another form of element content is **empty elements**, elements that contain no content. In HTML, the <br /> element indicating a line break and the <hr /> element indicating a horizontal rule are examples of empty elements. The characteristic of empty elements is that they do not contain data, text or child elements and the element is closed immediately. With DTD empty elements can be defined with the keyword ‘EMPTY’. For instance, the following statement defines an empty element named <relationship>: + +``` + +``` + +
+ +
+ + + + + +Creating Attributes + + + + +In the previous unit we learned the basic syntax of the DTD language to define a set of rules for our XML documents. We saw how rules for element names and element content can be defined. Now we will have a look at how attributes and attribute values can be defined in DTDs. + +In the last unit we built a DTD for the following address book XML: + +| ``` 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ``` | ``` Clark Kent
344 Clinton Street Metropolis
55 50145
Bruce Wayne
1007 Mountain Drive Gotham
53 59333
``` | +|---|---| + + + + + + +The element ‘relationship’ is an empty element containing no data and is therefore not meaningful yet. We will learn now how to add a ‘type’ attribute that contains information about the type of relationship. The DTD code that defines the above XML is the following: + + + + +| ``` 1 2 3 4 5 6 7 ``` | ``` ]> ``` | +|---|---| + + + + + +The following DTD statement defines an attribute named ‘type’ for the element ‘relationship’. + +| ``` 1 ``` | ``` ``` | +|---|---| + + + + + +The components of this statement are: + +| **<!** | Start of attribute definition | +|---|---| +| **ATTLIST** | declares an attribute list | +| **relationship** | declares the element to which the attribute belongs | +| **type** | declares the attribute name | +| **CDATA** | defines the content type of the attribute, CDATA stands for ‘character data’ | +| **\#REQUIRED** | declares that this attribute is mandatory on every ‘relationship’ element | +| **>** | End of attribute definition | + +DTDs allow to specify different content types for attributes. Besides character data (CDATA) the values … are possible. + +For some attributes it is useful to specify predefined values. This can be done by adding the predefined values in round brackets after the attribute name instead of the CDATA in the example above: + +| ``` 1 ``` | ``` ``` | +|---|---| + + + + + +One attribute list can contain several attributes of an element. For instance, the following statements declares the attributes ‘first’, ‘second’ and ‘title’ on the name element: + +| ``` 1 2 ``` | ``` ``` | +|---|---| + + + + + + All attributes that we defined here are mandatory as indicated by the #REQUIRED statement. In order to make an attribute optional #REQUIRED has to be replaced with #IMPLIED. As in the example of the ‘title’ attribute here: + +| | ```
title CDATA #IMPLIED> ``` | +|---|---| + +
+ +
+ + + + + +Quiz: Placing DTD + + + + +Where would you place a DTD? + + + +Where would you place a DTD? + + + +A DTD should be placed at the beginning of an XML document or in an external file. + + + +At the end of an XML document. + + + +Right after the root element. + + + +It is not allowed to place DTDs in an XML document. + + + +correct + + + +incorrect + + + + + + + + + + + + + + +DTD Exercise II: Adding Attributes + + + + +Now we will use the DTD created earlier for poems, and add two attribute values. + +1\) add the attribute n (for number) on the line and verse elements + +2\) add the attribute type on the note element + +This is the syntax you can use for the number attribute + +``` + +``` + +Be sure to create the statement directly after the line element statement. You can add the attributes on your own, or do it while watching the video below. + + + + + + + +
+ + +## Advanced DTD exercise + + + + + + + +Advanced DTD exercise + + + + +#### Recipe exercise + + +On the internet you can find a great number of cooking websites with recipes. Most cooking recipes have similar content types and a similar structure. For this exercise look up online recipe websites and try to find a model to express recipes. + +- Identify at least five things that you would like to tag in recipes and think about suitable element and/or attribute names +- Open up your XML editor and create a new XML document called recipe.xml +- Copy and paste the text from one of the recipes that you found online into recipe.xml +- Try to tag the different parts of the recipe with your elements and attributes. +- Make sure that the XML document is well-formed by checking well-formedness +- Write a DTD model for your recipe and validate your XML against it + + + + + + + + + +Challenge exercise + + + + +#### Cookbook exercise + +Based on the previous exercise you should create an online cookbook now. Starting from the recipe model, what elements and attributes do you need to create a model of a cookbook? + +- In your XML editor create a new XML document: cookbook.xml +- Copy and paste the DTD and XML from your recipe.xml +- Copy and paste the content for two other recipes into your new cookbook.xml +- Extend your DTD model to include additional elements that you think are required for a cookbook +- Add these additional elements to your cookbook XML and validate your XML document + + + + + + + + +# Unit III: How to encode with the TEI + +## TEI in practice + + + + + + + +The TEI header + + + + +### TEI Basics + +A TEI document consists of two main sections: the <teiHeader> and <text>. These two sections are child elements of the <TEI> root element. 
While the <text> element contains the document text (the encoded poem, letter or other textual object), the <teiHeader> provides for an extensive metadata record of the object, both the original analogue object (if appropriate) and the encoded version. A schematic of the basic structure is below in figure 1. + +| ``` 1 2 3 4 5 6 7 8 ``` | ``` ``` | +|---|---| + + + + + +Figure 1: a schematic of the basic structure of the TEI. + +### Structure of the TEI Header + +The main function of the <teiHeader> is to provide a bibliographic record of the electronic document. It includes four main sections (or child elements), not all of which are required for conformant TEI: + +- <fileDesc>: a bibliographic record of the electronic text as well as the source from which the electronic text is derived; +- <encodingDesc>: documentation of the encoding and editorial principles used in tagging the electronic text; +- <profileDesc>: terms for indexing, searching and retrieval; +- <revisionDesc>: a record of changes made to the electronic document. + +Figure 2 shows the order of these elements if they are all used, although the only required child element is <fileDesc>. + +| ``` 1 2 3 4 5 6 ``` | ``` ``` | +|---|---| + + + + + +Figure 2. The child elements of <teiHeader> and their required order. + +### <fileDesc> in more detail + +<fileDesc> is the only child element of the <teiHeader> that is mandatory in all TEI documents. <fileDesc>, in turn, must include three child elements to be conformant: <titleStmt>, <publicationStmt> and <sourceDesc>. 
+ +- <titleStmt> contains child elements that provide basic metadata about the document, including the title of the resource, author and/or editor names, as well as the names and roles of other people who contributed to the creation of the electronic document; +- <publicationStmt> contains basic child elements regarding publication information of the electronic text, including publisher name and address, copyright information, and publication date; +- <sourceDesc> contains child elements that describe the original source from which the electronic text was created. For instance, it may contain a detailed description of a manuscript or book. + +Figure 3 shows a basic TEI header structure with all mandatory elements: + +| ``` 1 2 3 4 5 6 7 8 9 10 11 12 13 ``` | ``` Title

Publication Information

Information about the source

``` | +|---|---| + + + + + +Figure 3: Mandatory TEI elements. + +Besides the three mandatory child elements of <fileDesc> described above, there are also four optional elements, the order of which is mandated by the Guidelines: + +- <editionStmt>: groups information relating to one edition of a text; +- <extent>: describes the approximate size of a text stored on some carrier medium or of some other object, digital or non-digital, specified in any convenient units; +- <seriesStmt>: groups information about the series, if any, to which a publication belongs; +- <notesStmt>: collects together any notes providing information about a text additional to that recorded in other parts of the bibliographic description. + + +### Options for encoding TEI header elements + +As described above, many of the elements contained in the <teiHeader> contain child elements that provide further structuring of the content. With many of these elements an editor can choose between a simple prose description or a more detailed structuring of the content using additional child elements. <encodingDesc> is a case in point. One option is to use a simple <p> tag for a prose description as in figure 4: + +| ``` 1 2 3 ``` | ```

Original spelling and typography is retained, except that long s and ligatured forms are not encoded.

``` | +|---|---| + + + + + +Figure 4. A simple prose description of <encodingDesc>. However, the <encodingDesc> could be encoded to provide much more structure via child elements, as in figure 5: + +| ``` 1 2 3 4 5 6 ``` | ``` describes the overall project purpose and process documents rationale for text sampling or selection in case parts of the text or corpus have been omitted explains editorial principles of encoding or transcribing texts provides information about nonstandard characters and glyphs ``` | +|---|---| + + + + + +Figure 5. A detailed encoding of <encodingDesc>. Both types of encoding are absolutely correct. The decision as to which one to use will depend on the goals of the project and the purpose of the encoded texts. For example, if it is important for users to be able to search on how nonstandard characters were handled in the encoding, then the second example would be more suitable as the use of the <charDecl> would allow that element to be isolated for search purposes. + +Chapter Two of the TEI Guidelines explains the <teiHeader> and all the child elements that can be used. The exercises in this unit also go into the Header in more detail. + +### +Further reading + +The TEI header. The TEI Guidelines. <[http://www.tei-c.org/release/doc/tei-p5-doc/en/html/HD.html](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/HD.html)> + +Module 2: The TEI header, TEI by Example. <[http://teibyexample.org/modules/TBED02v00.htm](http://teibyexample.org/modules/TBED02v00.htm)> +
+ +
+ + + + + +The TEI text + + + + +### Introduction + +While the <teiHeader> is the container element for metadata, <text> is the container element for the content. <text> contains three major subdivisions or child elements: + +1. <front>: (front matter) contains any prefatory matter (headers, abstracts, title page, prefaces, dedications, etc.) found at the start of a document, before the main body +2. <body>: (text body) contains the whole body of a single unitary text, excluding any front or back matter +3. <back>: (back matter) contains any appendixes, etc. following the main part of a text. + +### Front + +Front matter that is typical in published texts, particularly on the title page, or *preliminaries* of older printed books, is encoded within the <front> element as in figure 1: + +| ``` 1 2 3 4 5 6 7 8 9 10 ``` | ``` The Hobbit or There and Back Again by J. R. R. Tolkien illustrations by the author Boston and New York Houghton Mifflin Company 1938 ``` | +|---|---| + + + + + +Figure 1: Example of front matter + +Encoding of front matter is described in section [4.6 Title Pages](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/DS.html#DSTITL "Title Pages") of the TEI Guidelines. <front> contains a number of child elements including: + +- [titlePage](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-titlePage.html) (title page) contains the title page of a text, appearing within the front or back matter. +- [docTitle](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-docTitle.html) (document title) contains the title of a document, including all its constituents, as given on a title page. +- [titlePart](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-titlePart.html) contains a subsection or division of the title of a work, as indicated on a title page. +- [argument](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-argument.html) contains a formal list or prose description of the topics addressed by a subdivision of a text.
+- [byline](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-byline.html) contains the primary statement of responsibility given for a work on its title page or at the head or end of the work. +- [docAuthor](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-docAuthor.html) (document author) contains the name of the author of the document, as given on the title page (often but not always contained in a byline). +- [epigraph](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-epigraph.html) contains a quotation, anonymous or attributed, appearing at the start or end of a section or on a title page. +- [imprimatur](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-imprimatur.html) contains a formal statement authorizing the publication of a work, sometimes required to appear on a title page or its verso. +- [docEdition](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-docEdition.html) (document edition) contains an edition statement as presented on a title page of a document. +- [docImprint](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-docImprint.html) (document imprint) contains the imprint statement (place and date of publication, publisher name), as given (usually) at the foot of a title page. +- [docDate](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-docDate.html) (document date) contains the date of a document, as given on a title page or in a dateline. +- [graphic](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-graphic.html) indicates the location of a graphic or illustration, either forming part of a text, or providing an image of it. + + +### Body + +The <body> element, with its myriad of child elements, contains the text proper: be it a novel, a collection of poems, a play, a letter, etc, as described in chapter [4 Default Text Structure ](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/DS.html "7")of the Guidelines. 
Not only can the structure of these texts be encoded, but other textual information or features can be included, for example, notes, the position of illustrations, additions and deletions in manuscripts, or the presence of multiple hands in the creation of a manuscript. + +The TEI tends to favour less descriptive tags which can serve a variety of situations which, semantically, represent similar structures. For example, you will not find a tag for poetry called verse, stanza, couplet, or octave. Rather, the TEI provides the <lg> or line group element which acts as a less semantically specific container for all these types of divisions within a poem. If a particular project would like to specify further the nature of the <lg>, this can be done via the 'type' attribute: e.g. <lg type="couplet"> + +The same principle holds true for encoding books. A book may be structured into several chapters. In TEI this chapter structure can be expressed with the <div> (division) element. Any arbitrary division can be encoded using this element. Again, by using the @type attribute more granularity in the exact nature of division can be indicated, as in the following example: + +| ``` 1 2 3 4 5 6 7 8 ``` | ``` <body> <div n="1" type="chapter"> </div> <div n="2" type="chapter"> </div> <div n="3" type="chapter"> </div> </body> ``` | +|---|---| + + + + + +Figure 2: An example of encoding chapters using the <div> element. + +### Back matter + +The third and last section of <text> is the <back> element in which back matter is encoded. Back matter is typical of published texts and comprises appendixes, indexes, etc. following the main part of a text. Further details about this section can be found in chapter [4.7 Back Matter ](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/DS.html#DSBACK "Back Matter")of the Guidelines. + +Child elements of <back> are: + +- **appendix**: an ancillary self-contained section of a work, often providing additional but in some sense extra-canonical text.
+- **glossary**: a list of terms associated with definition texts (‘glosses’): this should be encoded as a <list type="gloss"> (see section 3.7 Lists). +- notes: a section in which textual or other kinds of notes are gathered together. +- **bibliogr**: a list of bibliographic citations: this should be encoded as a listBibl +- **index**: any form of index to the work. +- **colophon**: a statement appearing at the end of a book describing the conditions of its physical production. + +Figure 3 is an example of how to encode back matter: + +| ``` 1 2 3 4 5 6 7 8 9 10 11 12 13 ``` | ``` <div> <head>Appendixhead> <p>Here goes the text of the appendix text...p> div> <div> <head>Indexhead> First Index Entry Second Index Entry div> ``` | +|---|---| + + + + + +Fig 3: Encoding of back matter. ### Core Elements + +There is a set of elements that can be useful for the encoding of different types of texts and are therefore useful for multiple TEI modules. These are called TEI Core Elements and are described in a separate chapter of the TEI Guidelines ([3 Elements Available in All TEI Documents](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html)). These elements have various functions. Some are structural elements such as <p> (used to encode paragraphs). Others are used to encode highlighted (<hi>) or quoted (<q>) parts of a text. Others indicate editorial changes (the <reg> and <abbr> elements) or notes or annotations (<note>). An example of this is in figure 4: + +| ``` 1 2 3 4 5 6 7 ``` | ``` <body> <head>The Hobbithead> <p>In a hole in the groundThis is an editorial note. there lived a hobbit. Not a nasty, dirty, wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare, ... p> body> ``` | +|---|---| + + + + + +Figure 4: TEI core elements ### Further reading + +Module 3: Prose. TEI by Example. 
<[http://teibyexample.org/modules/TBED03v00.htm](http://teibyexample.org/modules/TBED03v00.htm)> + + + + + + + + + +Customising the TEI + + + + +TEI has over 500 elements for the encoding of many types of data. No single text encoding project will use all these elements, hence the TEI Consortium has created a number of subsets (modules) of the full TEI schema for certain data types. For instance, the [TEI Lite](http://www.tei-c.org/Guidelines/Customization/Lite/) schema is a good starting point for many TEI projects as it contains the most frequently used TEI elements. Other modules include specialised tags for dictionaries, manuscript description, verse, and performance texts. + +While these modules cover a wide variety of textual types, projects may still need to customise the TEI to better meet their needs. For instance, a schema may need to be more restricted in order to ensure consistent encoding, or the TEI may need to be extended so that elements from other XML standards or custom elements are added. The TEI encourages this type of customisation, with two caveats: + +- three modules are mandatory for all TEI customisations: core, header and textstructure; +- all changes to the TEI schema are documented through a mechanism called ODD (One Document Does it all). + +ODD is the XML vocabulary which the TEI system uses to describe itself. This includes creating appropriate documentation as well as the schema specifications about the elements and attributes used. Therefore, ODD is used to generate both technical documentation in the form of a DTD or RELAX NG and written documentation of the elements and attributes. + +The preferred way to create a TEI customisation is via an online tool that provides an interface that makes it easier to create and edit project-specific customisations called [Roma](http://www.tei-c.org/Roma/). Roma allows one to choose elements and attributes and add custom elements and attributes.
With Roma you can also validate your custom schema to make sure it still conforms to the TEI rules, you may save your customisation as well as export a DTD schema and HTML documentation. + +| | +|---| + +### Further reading: + +Getting Started with ODDs. <[http://www.tei-c.org/Guidelines/Customization/odds.xml](http://www.tei-c.org/Guidelines/Customization/odds.xml)> + +TEI: Using Roma. <[http://www.tei-c.org/Roma/](http://www.tei-c.org/Roma/)> + + + + + + + + + +Simple TEI Exercise + + + + +### Exercise: + +1. Copy and paste the following code into a new XML file + + | ``` 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 ``` | ``` <!-- title of the electronic text -->

, <!-- title of the book -->, , .
``` | + |---|---| +2. Replace the comments in the <teiHeader> section with the following metadata: + +- Title: There and back again: an electronic edition +- Author: J.R.R. Tolkien +- Copyright: The copyright owner of the English translation is Tolkien, 1937 +- Repository: Digital Repository of my University +- The source: J.R.R. Tolkien. The Hobbit. Ballantine Books, New York. 1937. + +3. Validate the TEI document and make sure it is well-formed and valid! +
+ +
+ + + + + +The TEI Consortium + + + + +The TEI Consortium is a not-for-profit membership organisation that has existed since 2000, although the TEI as a tagset and Guidelines has existed since 1987. The Guidelines have become the de facto standard for encoding textual material in the humanities and in cultural heritage. Their extensive yet customisable tagset (several hundred elements) provides for everything from the most basic encoding (for example, the structure of a text in paragraphs, chapters, etc) to extremely nuanced and interpretative encoding of texts from dictionaries, to versions of poetry, to drama. The TEI, as expressed first in SGML and currently in XML, can be used across such a wide variety of textual types and in a wide variety of publishing environments because of its being hardware and software independent. It also benefits from being community-driven. It is designed and developed by and for the scholarly research community (Burnard, 2015), and as such, is responsive to community needs. + +One of the main roles of The TEI Consortium is to develop and maintain guidelines for encoding of humanities data. These Guidelines were first developed for the encoding of text, but they have been extended to include other types of data. For example, the TEI Header can be used to describe multimedia objects. The TEI Guidelines provide detailed information on all TEI elements, as well as providing recommendations and examples on how they should be used. The Guidelines provide the single most comprehensive source about using the TEI and can be accessed [online](http://www.tei-c.org/Guidelines/P5/ "TEI Guidelines P5"). + +TEI elements are organised in modules. The 'core' module contains elements essential to most encoding projects (including paragraphs, quotations, lines and stanzas of poetry, simple links, etc), which are described in [Chapter Three](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html "Chapter 3 TEI Guidelines") of the Guidelines.
Additionally, there are modules for the encoding of specialised text types, such as prose, poetry, manuscripts and dictionaries. In this unit we will cover basic TEI rules and some of the more common/important TEI tags described in the Guidelines. An alphabetical list of all [TEI Elements](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/REF-ELEMENTS.html "TEI List of Elements") and how they are used is probably the easiest way to understand how a particular element is used. + +The following video by Lou Burnard, one of the founding editors of the TEI, is an excellent introduction to the TEI and its history. + + + +### Further reading + +Lou Burnard. What is the Text Encoding Initiative? How to add intelligent markup to digital resources. Marseille. 2014. <[http://books.openedition.org/oep/426](http://books.openedition.org/oep/426)>. + +‘TEI: P5 Guidelines’, *Text Encoding Initiative* <[http://www.tei-c.org/Guidelines/P5/](http://www.tei-c.org/Guidelines/P5/)> + + + + + +
+ + +## Yeats exercise: encoding poetry with TEI + + + + +Yeats exercise: encoding poetry with TEI + + + +In this exercise you will encode a poem and its associated bibliographic information in the <teiHeader>. This exercise is designed to be completed within the oXygen editor as many of the instructions indicate how to encode and validate your XML document using this software. + +All the files you need for this exercise are included in the zip file below. Save the zip file to your desktop. When you unzip the file, you will find four subfolders: + +- documents +- extras +- schemas +- stylesheets + +The **documents** folder contains one file: 'yeats\_template.xml'. We have pre-populated much of the encoding needed for this poem. You will need to complete the rest of the encoding. Instructions on how to complete the poem are included in the **extras** folder (Instructions on encoding the Yeats poem.pdf). Also in the extras folder are + +- the text for the poem +- a challenge exercise (instructions for which are in section F of the instructions PDF) + +The **schemas** folder contains the teilite schema, which the poem will be validated against. The **stylesheets** folder similarly contains the stylesheet (yeats.css) that is used for this exercise. + +Before you begin the exercise, watch the following video, which provides you with information that should help you avoid making common mistakes and further explains the file structure. + + + + + + + +## Franklin exercise: encode prose with TEI + + + + +Franklin exercise: encode prose with TEI + + + +Download the following zip file containing the exercise documents. You will find the instructions in the folder 'handouts'. + + + + + +## Specialist encoding + + + + + + + +Encoding of correspondence with the TEI + + + + +Over the last decade a great number of letter and correspondence encoding projects have been using TEI.
For instance, the DALF: Digital Archive of Letters in Flanders ([http://ctb.kantl.be/project/dalf/](http://ctb.kantl.be/project/dalf/)), The Van Gogh Letters ([http://vangoghletters.org/vg/](http://vangoghletters.org/vg/)) and others. The Text Encoding Initiative has its own Special Interest Group that develops standards and best-practice guidelines for the encoding of correspondence with TEI ([http://wiki.tei-c.org/index.php/SIG:Correspondence](http://wiki.tei-c.org/index.php/SIG:Correspondence)). + +When we designed this course, we also worked on a correspondence project: The Letters of 1916 project ([http://letters1916.maynoothuniversity.ie/](http://letters1916.maynoothuniversity.ie/)) and therefore we wanted to include a section on encoding this genre in this course. + +### Correspondence specific header elements + +For the encoding of correspondence metadata you use essentially the same TEI header as for other texts. However, in addition to the already mentioned elements that can be used in the TEI header, the TEI provides a set of other elements that were specifically designed for the description of correspondence. These are the <correspDesc> element and its child elements. + +The <correspDesc> element was introduced to provide further description about the correspondence such as who sent a letter to whom, and from where it was sent. + +The <correspDesc> element has two child elements, <correspAction> and <correspContext>. The first is used to store a structured description of the place a letter is from, the name of the person that sent a letter and the date it was sent or any other action related to the correspondence. The <correspContext> element on the other hand provides references to letters or other correspondence that was sent prior or afterwards and is somehow related to the letter. + +| ``` 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 ``` | ``` Major E. Reade Dartmoor Prison 05 July 1916 Mabel FitzGerald Bray, Co.
Wicklow, Ireland unknown Previous letter from Major E. Reade to Mabel FitzGerald: 20 June 1916 Next letter of Major E. Reade to Mabel FitzGerald: 10 July 1916 ``` | +|---|---| + + + + + +For the transcription of the letter a number of additional elements can be used that are genre specific for the tagging of correspondence features. As we have learned earlier in this course the transcribed text is within the <text> and <body> section of the TEI document. There are specific TEI elements for the encoding of opener, closer and postscripts and they can be nested in the following way: + +| ``` 1 2 3 4 5 6 7 8 9 10 ``` | ``` The opener of the letter goes here

The transcription of the letter goes here

The closer of the letter goes here

Postscripts (P.S.) goes here

``` | +|---|---| + + + + + + +Within the <opener> address information, the date and the salutation can be encoded using suitable elements such as <address>, <dateline> and <salute>: + +| ``` 1 2 3 4 5 6 7 8 ``` | ```
H.M.Prison, Dartmoor
5 July 1916. Madam T.D.Fitzgerald
``` | +|---|---| + + + + + +Similarly, the <closer> can nest elements for further structure, such as salute, signed, dateline, etc.: + +| ``` 1 2 3 4 5 ``` | ``` I am, Madam, Your obedient Servant, E. Reade Dartmoor, 5 July ``` | +|---|---| + + + + + + +Between the opener and the closer is the actual message of a letter and this message can be encoded using <p> tags, which indicate paragraphs. Further information on how to use the opener, the closer and other relevant tags can be found together with detailed examples in the TEI guidelines. To familiarise yourself with the TEI guidelines, please have a look at the following two sections now: + +2.4.6 Correspondence Description. TEI P5 Guidelines. <[http://www.tei-c.org/release/doc/tei-p5-doc/en/html/HD.html#HD44CD](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/HD.html#HD44CD)> + +4.2.2 Openers and Closers. TEI P5 Guidelines. <[http://www.tei-c.org/release/doc/tei-p5-doc/en/html/DS.html#DSOC](http://www.tei-c.org/release/doc/tei-p5-doc/en/html/DS.html#DSOC)> + +### Further reading + +Peter Stadler, Marcel Illetschko, and Sabine Seifert. Towards a Model for Encoding Correspondence in the TEI: Developing and Implementing <correspDesc>. Journal of the Text Encoding Initiative. Issue 9. 2016. <[http://jtei.revues.org/1433](http://jtei.revues.org/1433)> +
+ +
+ +
+ diff --git a/lib/content/mdx/components.tsx b/lib/content/mdx/components.tsx index 79d6622b2..5890fa9e7 100644 --- a/lib/content/mdx/components.tsx +++ b/lib/content/mdx/components.tsx @@ -1,3 +1,5 @@ +import type { ReactNode } from "react"; + import { Callout } from "@/components/content/callout"; import { Diagram, DiagramCaption, DiagramCodeBlock } from "@/components/content/diagram"; import { Disclosure } from "@/components/content/disclosure"; @@ -44,6 +46,143 @@ export const components = { Tabs, Video, VideoCard, + + /** + * DARIAH-teach components. + * + * Most of these should be dropped in another transform step for the actual content migration. + */ + Page(props: Readonly<{ children: ReactNode; id: string; moduleId: string }>): ReactNode { + const { children } = props; + + return children; + }, + PageTitle(_props: Readonly<{ children: ReactNode }>): ReactNode { + // const { children } = props; + + // TODO: Avoid multiple h1 elements. + // return

{children}

; + + // FIXME: currently titles are duplicated in markdown + return null; + }, + PageIntro(props: Readonly<{ children: ReactNode }>): ReactNode { + const { children } = props; + + // eslint-disable-next-line tailwindcss/no-custom-classname + return
{children}
; + }, + PageContent(props: Readonly<{ children: ReactNode }>): ReactNode { + const { children } = props; + + return children; + }, + Resource( + props: Readonly<{ + children: ReactNode; + title: string; + id: string; + moduleId: string; + files: Array<{ file: string }>; + }>, + ): ReactNode { + const { files } = props; + + return ( + + ); + }, + IframeElement(props: Readonly<{ w: number; h: number; alt: string; src: string }>): ReactNode { + const { src } = props; + + if (src.startsWith("https://www.youtube.com/embed/")) { + const id = new URL(src).pathname.split("/").pop(); + + if (id != null && id !== "") { + return