jgm · yanntrividic · Jun 23, 2025 · Jun 23, 2025 · Jun 23, 2025 · Jun 23, 2025
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
@@ -46,7 +46,7 @@ import Text.Pandoc.Builder
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
 import Text.Pandoc.Options
 import Text.Pandoc.Logging (LogMessage(..))
-import Text.Pandoc.Shared (safeRead, extractSpaces)
+import Text.Pandoc.Shared (safeRead, extractSpaces, addPandocAttributes)
 import Text.Pandoc.Sources (ToSources(..), sourcesToText)
 import Text.Pandoc.Transforms (headerShift)
 import Text.TeXMath (readMathML, writeTeX)
@@ -855,15 +855,19 @@ getBlocks :: PandocMonad m => Element -> DB m Blocks
 getBlocks e =  mconcat <$>
                  mapM parseBlock (elContent e)
 
+getRoleAttr :: Element -> [(Text, Text)] -- the element's "role" attribute as a zero-or-one element key-value list
+getRoleAttr e = case attrValue "role" e of
+                  "" -> []            -- no usable role (presumably attrValue yields "" when the attribute is absent -- confirm)
+                  r  -> [("role", r)] -- keep the role value verbatim under the "role" key
 
 parseBlock :: PandocMonad m => Content -> DB m Blocks
 parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE
 parseBlock (Text (CData _ s _)) = if T.all isSpace s
                                      then return mempty
                                      else return $ plain $ trimInlines $ text s
 parseBlock (CRef x) = return $ plain $ str $ T.toUpper x
-parseBlock (Elem e) =
-  case qName (elName e) of
+parseBlock (Elem e) = do
+  parsedBlock <- case qName (elName e) of
         "toc"   -> skip -- skip TOC, since in pandoc it's autogenerated
         "index" -> skip -- skip index, since page numbers meaningless
         "para"  -> parseMixed para (elContent e)
@@ -975,6 +979,7 @@ parseBlock (Elem e) =
         "title" -> return mempty     -- handled in parent element
         "subtitle" -> return mempty  -- handled in parent element
         _ -> skip >> getBlocks e
+  return $ addPandocAttributes (getRoleAttr e) parsedBlock
    where skip = do
            let qn = qName $ elName e
            let name = if "pi-" `T.isPrefixOf` qn
@@ -1112,7 +1117,12 @@ parseBlock (Elem e) =
            modify $ \st -> st{ dbSectionLevel = n }
            b <- getBlocks e
            modify $ \st -> st{ dbSectionLevel = n - 1 }
-           return $ headerWith (elId, classes, maybeToList titleabbrevElAsAttr++attrs) n' headerText <> b
+           let content = headerWith (elId, classes, maybeToList titleabbrevElAsAttr)
+                          n' headerText <> b
+           return $ case attrValue "role" e of
+                      "" -> content
+                      _  -> divWith ("", ["section"],
+                             ("level", T.pack $ show n') : attrs) content
          titleabbrevElAsAttr =
            case filterChild (named "titleabbrev") e `mplus`
                 (filterChild (named "info") e >>=
@@ -1135,9 +1145,8 @@ parseBlock (Elem e) =
            b <- p
            case mbt of
              Nothing -> return b
-             Just t -> return $ divWith (attrValue "id" e,[],[])
+             Just t -> return $ divWith (attrValue "id" e, [], getRoleAttr e)
                          (divWith ("", ["title"], []) (plain t) <> b)
-
          -- Admonitions are parsed into a div. Following other Docbook tools that output HTML,
          -- we parse the optional title as a div with the @title@ class, and give the
          -- block itself a class corresponding to the admonition name.
@@ -1226,8 +1235,8 @@ parseInline (Text (CData _ s _)) = do
      else return $ text s
 parseInline (CRef ref) =
   return $ text $ fromMaybe (T.toUpper ref) $ lookupEntity ref
-parseInline (Elem e) =
-  case qName (elName e) of
+parseInline (Elem e) = do
+  parsedInline <- case qName (elName e) of
         "anchor" -> do
            return $ spanWith (attrValue "id" e, [], []) mempty
         "phrase" -> do
@@ -1349,6 +1358,9 @@ parseInline (Elem e) =
         -- <?asciidor-br?> to in handleInstructions, above.
         "pi-asciidoc-br" -> return linebreak
         _          -> skip >> innerInlines id
+  return $ case qName (elName e) of
+    "emphasis" -> parsedInline
+    _ -> addPandocAttributes (getRoleAttr e) parsedInline
    where skip = do
            let qn = qName $ elName e
            let name = if "pi-" `T.isPrefixOf` qn

diff --git a/test/docbook-reader.docbook b/test/docbook-reader.docbook
@@ -27,9 +27,9 @@
   This is a set of tests for pandoc. Most of them are adapted from John
   Gruber’s markdown test suite.
 </para>
-<sect1 id="headers">
+<sect1 id="headers" role="sect1role">
   <title>Headers</title>
-  <sect2 id="level-2-with-an-embedded-link">
+  <sect2 id="level-2-with-an-embedded-link" role="sect2role">
     <title>Level 2 with an <ulink url="/url">embedded link</ulink></title>
     <sect3 id="level-3-with-emphasis">
       <title>Level 3 with <emphasis>emphasis</emphasis></title>
@@ -74,6 +74,9 @@
   <para>
     Here’s a regular paragraph.
   </para>
+  <para role="pararole">
+    And here’s a regular paragraph with a role.
+  </para>
   <para>
     In Markdown 1.0.0 and earlier. Version 8. This line turns into a list
     item. Because a hard-wrapped line in the middle of a paragraph looked like
@@ -93,6 +96,11 @@
       This is a block quote. It is pretty short.
     </para>
   </blockquote>
+  <blockquote role="roleblockquote">
+    <para>
+      This is a block quote with a role.
+    </para>
+  </blockquote>
   <blockquote>
     <para>
       Code in a block quote:
@@ -233,6 +241,26 @@ These should not be escaped:  \$ \\ \&gt; \[ \{
         </para>
       </listitem>
     </orderedlist>
+    <para>
+      with role:
+    </para>
+    <orderedlist role="listrole" numeration="arabic">
+      <listitem>
+        <para>
+          First
+        </para>
+      </listitem>
+      <listitem>
+        <para>
+          Second
+        </para>
+      </listitem>
+      <listitem>
+        <para>
+          Third
+        </para>
+      </listitem>
+    </orderedlist>
     <para>
       and tight:
     </para>
@@ -702,6 +730,12 @@ These should not be escaped:  \$ \\ \&gt; \[ \{
   <para>
     So is <emphasis role="strong"><emphasis>this</emphasis></emphasis> word.
   </para>
+  <para>
+    So is <emphasis role="emphasisrole"><emphasis>this</emphasis></emphasis> word with a role.
+  </para>
+  <para>
+    So is <phrase role="phraserole"><phrase>this</phrase></phrase> phrase with a role.
+  </para>
   <para>
     This is code: <literal>&gt;</literal>, <literal>$</literal>,
     <literal>\</literal>, <literal>\$</literal>,
@@ -1408,7 +1442,7 @@ or here: &lt;http://example.com/&gt;
   <para>
     Table with attributes
   </para>
-  <table xml:id="mytableid1" class="mytableclass1 mytableclass2" tabstyle="mytabstyle1">
+  <table xml:id="mytableid1" class="mytableclass1 mytableclass2" tabstyle="mytabstyle1" role="tablerole1">
     <title>
       Attribute table caption
     </title>
@@ -1444,7 +1478,7 @@ or here: &lt;http://example.com/&gt;
   <para>
     Table with attributes, without caption
   </para>
-  <informaltable xml:id="mytableid2" class="mytableclass3 mytableclass4" tabstyle="mytabstyle2">
+  <informaltable xml:id="mytableid2" class="mytableclass3 mytableclass4" tabstyle="mytabstyle2" role="tablerole2">
     <tgroup>
       <thead>
 	<th>