initial commit

2026-02-05 14:06:23 -06:00 · 2024-10-30 11:19:09 -03:00
commit 8654a31a4d
3744 changed files with 585542 additions and 0 deletions
@@ -0,0 +1,540 @@
+# $Id: frontmatter.py 9552 2024-03-08 23:41:31Z milde $
+# Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
+# Copyright: This module has been placed in the public domain.
+
+"""
+Transforms_ related to the front matter of a document or a section
+(information found before the main text):
+
+- `DocTitle`: Used to transform a lone top level section's title to
+  the document title, promote a remaining lone top-level section's
+  title to the document subtitle, and determine the document's title
+  metadata (document['title']) based on the document title and/or the
+  "title" setting.
+
+- `SectionSubTitle`: Used to transform a lone subsection into a
+  subtitle.
+
+- `DocInfo`: Used to transform a bibliographic field list into docinfo
+  elements.
+
+.. _transforms: https://docutils.sourceforge.io/docs/api/transforms.html
+"""
+
+__docformat__ = 'reStructuredText'
+
+import re
+
+from docutils import nodes, parsers, utils
+from docutils.transforms import TransformError, Transform
+
+
+class TitlePromoter(Transform):
+
+    """
+    Abstract base class for DocTitle and SectionSubTitle transforms.
+    """
+
+    def promote_title(self, node):
+        """
+        Transform the following tree::
+
+            <node>
+                <section>
+                    <title>
+                    ...
+
+        into ::
+
+            <node>
+                <title>
+                ...
+
+        `node` is normally a document.
+        """
+        # Type check
+        if not isinstance(node, nodes.Element):
+            raise TypeError('node must be of Element-derived type.')
+
+        # `node` must not have a title yet.
+        assert not (len(node) and isinstance(node[0], nodes.title))
+        section, index = self.candidate_index(node)
+        if index is None:
+            return False
+
+        # Transfer the section's attributes to the node:
+        # NOTE: Change `replace` to False to NOT replace attributes that
+        #       already exist in node with those in section.
+        # NOTE: Remove `and_source` to NOT copy the 'source'
+        #       attribute from section
+        node.update_all_atts_concatenating(section, replace=True,
+                                           and_source=True)
+
+        # setup_child is called automatically for all nodes.
+        node[:] = (section[:1]        # section title
+                   + node[:index]     # everything that was in the
+                                      # node before the section
+                   + section[1:])     # everything that was in the section
+        assert isinstance(node[0], nodes.title)
+        return True
+
+    def promote_subtitle(self, node):
+        """
+        Transform the following node tree::
+
+            <node>
+                <title>
+                <section>
+                    <title>
+                    ...
+
+        into ::
+
+            <node>
+                <title>
+                <subtitle>
+                ...
+        """
+        # Type check
+        if not isinstance(node, nodes.Element):
+            raise TypeError('node must be of Element-derived type.')
+
+        subsection, index = self.candidate_index(node)
+        if index is None:
+            return False
+        subtitle = nodes.subtitle()
+
+        # Transfer the subsection's attributes to the new subtitle
+        # NOTE: Change `replace` to False to NOT replace attributes
+        #       that already exist in node with those in section.
+        # NOTE: Remove `and_source` to NOT copy the 'source'
+        #       attribute from section.
+        subtitle.update_all_atts_concatenating(subsection, replace=True,
+                                               and_source=True)
+
+        # Transfer the contents of the subsection's title to the
+        # subtitle:
+        subtitle[:] = subsection[0][:]
+        node[:] = (node[:1]       # title
+                   + [subtitle]
+                   # everything that was before the section:
+                   + node[1:index]
+                   # everything that was in the subsection:
+                   + subsection[1:])
+        return True
+
+    def candidate_index(self, node):
+        """
+        Find and return the promotion candidate and its index.
+
+        Return (None, None) if no valid candidate was found.
+        """
+        index = node.first_child_not_matching_class(
+            nodes.PreBibliographic)
+        if (index is None or len(node) > (index + 1)
+            or not isinstance(node[index], nodes.section)):
+            return None, None
+        else:
+            return node[index], index
+
+
+class DocTitle(TitlePromoter):
+
+    """
+    In reStructuredText_, there is no way to specify a document title
+    and subtitle explicitly. Instead, we can supply the document title
+    (and possibly the subtitle as well) implicitly, and use this
+    two-step transform to "raise" or "promote" the title(s) (and their
+    corresponding section contents) to the document level.
+
+    1. If the document contains a single top-level section as its first
+       element (instances of `nodes.PreBibliographic` are ignored),
+       the top-level section's title becomes the document's title, and
+       the top-level section's contents become the document's immediate
+       contents. The title is also used for the <document> element's
+       "title" attribute default value.
+
+    2. If step 1 successfully determines the document title, we
+       continue by checking for a subtitle.
+
+       If the lone top-level section itself contains a single second-level
+       section as its first "non-PreBibliographic" element, that section's
+       title is promoted to the document's subtitle, and that section's
+       contents become the document's immediate contents.
+
+    Example:
+       Given this input text::
+
+           =================
+            Top-Level Title
+           =================
+
+           Second-Level Title
+           ~~~~~~~~~~~~~~~~~~
+
+           A paragraph.
+
+       After parsing and running the DocTitle transform, the result is::
+
+           <document names="top-level title">
+               <title>
+                   Top-Level Title
+               <subtitle names="second-level title">
+                   Second-Level Title
+               <paragraph>
+                   A paragraph.
+
+       (Note that the implicit hyperlink target generated by the
+       "Second-Level Title" is preserved on the <subtitle> element
+       itself.)
+
+    Any `nodes.PreBibliographic` instances occurring before the
+    document title or subtitle are accumulated and inserted as
+    the first body elements after the title(s).
+
+    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
+    """
+
+    default_priority = 320
+
+    def set_metadata(self):
+        """
+        Set document['title'] metadata title from the following
+        sources, listed in order of priority:
+
+        * Existing document['title'] attribute.
+        * "title" setting.
+        * Document title node (as promoted by promote_title).
+        """
+        if not self.document.hasattr('title'):
+            if self.document.settings.title is not None:
+                self.document['title'] = self.document.settings.title
+            elif len(self.document) and isinstance(self.document[0],
+                                                   nodes.title):
+                self.document['title'] = self.document[0].astext()
+
+    def apply(self):
+        if self.document.settings.setdefault('doctitle_xform', True):
+            # promote_(sub)title defined in TitlePromoter base class.
+            if self.promote_title(self.document):
+                # If a title has been promoted, also try to promote a
+                # subtitle.
+                self.promote_subtitle(self.document)
+        # Set document['title'].
+        self.set_metadata()
+
+
+class SectionSubTitle(TitlePromoter):
+
+    """
+    This works like document subtitles, but for sections.  For example, ::
+
+        <section>
+            <title>
+                Title
+            <section>
+                <title>
+                    Subtitle
+                ...
+
+    is transformed into ::
+
+        <section>
+            <title>
+                Title
+            <subtitle>
+                Subtitle
+            ...
+
+    For details refer to the docstring of DocTitle.
+    """
+
+    default_priority = 350
+
+    def apply(self):
+        if not self.document.settings.setdefault('sectsubtitle_xform', True):
+            return
+        for section in self.document.findall(nodes.section):
+            # On our way through the node tree, we are modifying it
+            # but only the not-yet-visited part, so that the iterator
+            # returned by findall() is not corrupted.
+            self.promote_subtitle(section)
+
+
+class DocInfo(Transform):
+
+    """
+    This transform is specific to the reStructuredText_ markup syntax;
+    see "Bibliographic Fields" in the `reStructuredText Markup
+    Specification`_ for a high-level description. This transform
+    should be run *after* the `DocTitle` transform.
+
+    If the document contains a field list as the first element (instances
+    of `nodes.PreBibliographic` are ignored), registered bibliographic
+    field names are transformed to the corresponding DTD elements,
+    becoming child elements of the <docinfo> element (except for a
+    dedication and/or an abstract, which become <topic> elements after
+    <docinfo>).
+
+    For example, given this document fragment after parsing::
+
+        <document>
+            <title>
+                Document Title
+            <field_list>
+                <field>
+                    <field_name>
+                        Author
+                    <field_body>
+                        <paragraph>
+                            A. Name
+                <field>
+                    <field_name>
+                        Status
+                    <field_body>
+                        <paragraph>
+                            $RCSfile$
+            ...
+
+    After running the bibliographic field list transform, the
+    resulting document tree would look like this::
+
+        <document>
+            <title>
+                Document Title
+            <docinfo>
+                <author>
+                    A. Name
+                <status>
+                    frontmatter.py
+            ...
+
+    The "Status" field contained an expanded RCS keyword, which is
+    normally (but optionally) cleaned up by the transform. The sole
+    contents of the field body must be a paragraph containing an
+    expanded RCS keyword of the form "$keyword: expansion text $". Any
+    RCS keyword can be processed in any bibliographic field. The
+    dollar signs and leading RCS keyword name are removed. Extra
+    processing is done for the following RCS keywords:
+
+    - "RCSfile" expands to the name of the file in the RCS or CVS
+      repository, which is the name of the source file with a ",v"
+      suffix appended. The transform will remove the ",v" suffix.
+
+    - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
+      time zone). The RCS Keywords transform will extract just the
+      date itself and transform it to an ISO 8601 format date, as in
+      "2000-12-31".
+
+      (Since the source file for this text is itself stored under CVS,
+      we can't show an example of the "Date" RCS keyword because we
+      can't prevent any RCS keywords used in this explanation from
+      being expanded. Only the "RCSfile" keyword is stable; its
+      expansion text changes only if the file name changes.)
+
+    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
+    .. _reStructuredText Markup Specification:
+       https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html
+    """
+
+    default_priority = 340
+
+    biblio_nodes = {
+          'author': nodes.author,
+          'authors': nodes.authors,
+          'organization': nodes.organization,
+          'address': nodes.address,
+          'contact': nodes.contact,
+          'version': nodes.version,
+          'revision': nodes.revision,
+          'status': nodes.status,
+          'date': nodes.date,
+          'copyright': nodes.copyright,
+          'dedication': nodes.topic,
+          'abstract': nodes.topic}
+    """Canonical field name (lowcased) to node class name mapping for
+    bibliographic fields (field_list)."""
+
+    def apply(self):
+        if not self.document.settings.setdefault('docinfo_xform', True):
+            return
+        document = self.document
+        index = document.first_child_not_matching_class(
+              nodes.PreBibliographic)
+        if index is None:
+            return
+        candidate = document[index]
+        if isinstance(candidate, nodes.field_list):
+            biblioindex = document.first_child_not_matching_class(
+                  (nodes.Titular, nodes.Decorative, nodes.meta))
+            nodelist = self.extract_bibliographic(candidate)
+            del document[index]         # untransformed field list (candidate)
+            document[biblioindex:biblioindex] = nodelist
+
+    def extract_bibliographic(self, field_list):
+        docinfo = nodes.docinfo()
+        bibliofields = self.language.bibliographic_fields
+        labels = self.language.labels
+        topics = {'dedication': None, 'abstract': None}
+        for field in field_list:
+            try:
+                name = field[0][0].astext()
+                normedname = nodes.fully_normalize_name(name)
+                if not (len(field) == 2 and normedname in bibliofields
+                        and self.check_empty_biblio_field(field, name)):
+                    raise TransformError
+                canonical = bibliofields[normedname]
+                biblioclass = self.biblio_nodes[canonical]
+                if issubclass(biblioclass, nodes.TextElement):
+                    if not self.check_compound_biblio_field(field, name):
+                        raise TransformError
+                    utils.clean_rcs_keywords(
+                          field[1][0], self.rcs_keyword_substitutions)
+                    docinfo.append(biblioclass('', '', *field[1][0]))
+                elif issubclass(biblioclass, nodes.authors):
+                    self.extract_authors(field, name, docinfo)
+                elif issubclass(biblioclass, nodes.topic):
+                    if topics[canonical]:
+                        field[-1] += self.document.reporter.warning(
+                            'There can only be one "%s" field.' % name,
+                            base_node=field)
+                        raise TransformError
+                    title = nodes.title(name, labels[canonical])
+                    title[0].rawsource = labels[canonical]
+                    topics[canonical] = biblioclass(
+                        '', title, classes=[canonical], *field[1].children)
+                else:
+                    docinfo.append(biblioclass('', *field[1].children))
+            except TransformError:
+                if len(field[-1]) == 1 \
+                       and isinstance(field[-1][0], nodes.paragraph):
+                    utils.clean_rcs_keywords(
+                        field[-1][0], self.rcs_keyword_substitutions)
+                # if normedname not in bibliofields:
+                classvalue = nodes.make_id(normedname)
+                if classvalue:
+                    field['classes'].append(classvalue)
+                docinfo.append(field)
+        nodelist = []
+        if len(docinfo) != 0:
+            nodelist.append(docinfo)
+        for name in ('dedication', 'abstract'):
+            if topics[name]:
+                nodelist.append(topics[name])
+        return nodelist
+
+    def check_empty_biblio_field(self, field, name):
+        if len(field[-1]) < 1:
+            field[-1] += self.document.reporter.warning(
+                  f'Cannot extract empty bibliographic field "{name}".',
+                  base_node=field)
+            return False
+        return True
+
+    def check_compound_biblio_field(self, field, name):
+        # Check that the `field` body contains a single paragraph
+        # (i.e. it must *not* be a compound element).
+        f_body = field[-1]
+        if len(f_body) == 1 and isinstance(f_body[0], nodes.paragraph):
+            return True
+        # Restore single author name with initial (E. Xampl) parsed as
+        # enumerated list
+        # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#enumerated-lists
+        if (isinstance(f_body[0], nodes.enumerated_list)
+            and '\n' not in f_body.rawsource.strip()):
+            # parse into a dummy document and use created nodes
+            _document = utils.new_document('*DocInfo transform*',
+                                           field.document.settings)
+            parser = parsers.rst.Parser()
+            parser.parse('\\'+f_body.rawsource, _document)
+            if (len(_document.children) == 1
+                and isinstance(_document.children[0], nodes.paragraph)):
+                f_body.children = _document.children
+                return True
+        # Check failed, add a warning
+        content = [f'<{e.tagname}>' for e in f_body.children]
+        if len(content) > 1:
+            content = '[' + ', '.join(content) + ']'
+        else:
+            content = 'a ' + content[0]
+        f_body += self.document.reporter.warning(
+                      f'Bibliographic field "{name}"\nmust contain '
+                      f'a single <paragraph>, not {content}.',
+                      base_node=field)
+        return False
+
+    rcs_keyword_substitutions = [
+          (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
+                      r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
+          (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
+          (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1')]
+
+    def extract_authors(self, field, name, docinfo):
+        try:
+            if len(field[1]) == 1:
+                if isinstance(field[1][0], nodes.paragraph):
+                    authors = self.authors_from_one_paragraph(field)
+                elif isinstance(field[1][0], nodes.bullet_list):
+                    authors = self.authors_from_bullet_list(field)
+                else:
+                    raise TransformError
+            else:
+                authors = self.authors_from_paragraphs(field)
+            authornodes = [nodes.author('', '', *author)
+                           for author in authors if author]
+            if len(authornodes) >= 1:
+                docinfo.append(nodes.authors('', *authornodes))
+            else:
+                raise TransformError
+        except TransformError:
+            field[-1] += self.document.reporter.warning(
+                f'Cannot extract "{name}" from bibliographic field:\n'
+                f'Bibliographic field "{name}" must contain either\n'
+                ' a single paragraph (with author names separated by one of '
+                f'"{"".join(self.language.author_separators)}"),\n'
+                ' multiple paragraphs (one per author),\n'
+                ' or a bullet list with one author name per item.\n'
+                'Note: Leading initials can cause (mis)recognizing names '
+                'as enumerated list.',
+                base_node=field)
+            raise
+
+    def authors_from_one_paragraph(self, field):
+        """Return list of Text nodes with author names in `field`.
+
+        Author names must be separated by one of the "autor separators"
+        defined for the document language (default: ";" or ",").
+        """
+        # @@ keep original formatting? (e.g. ``:authors: A. Test, *et-al*``)
+        text = ''.join(str(node)
+                       for node in field[1].findall(nodes.Text))
+        if not text:
+            raise TransformError
+        for authorsep in self.language.author_separators:
+            # don't split at escaped `authorsep`:
+            pattern = '(?<!\x00)%s' % authorsep
+            authornames = re.split(pattern, text)
+            if len(authornames) > 1:
+                break
+        authornames = (name.strip() for name in authornames)
+        return [[nodes.Text(name)] for name in authornames if name]
+
+    def authors_from_bullet_list(self, field):
+        authors = []
+        for item in field[1][0]:
+            if isinstance(item, nodes.comment):
+                continue
+            if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
+                raise TransformError
+            authors.append(item[0].children)
+        if not authors:
+            raise TransformError
+        return authors
+
+    def authors_from_paragraphs(self, field):
+        for item in field[1]:
+            if not isinstance(item, (nodes.paragraph, nodes.comment)):
+                raise TransformError
+        authors = [item.children for item in field[1]
+                   if not isinstance(item, nodes.comment)]
+        return authors