From f9cd7a80371f6d8275fe095115399aae3489a8c0 Mon Sep 17 00:00:00 2001 From: Anthony Johnson Date: Fri, 13 Jun 2025 15:33:04 -0700 Subject: [PATCH] Add support for subproject and per-version sitemaps, and styled main sitemap This was a little morning project to expand our default sitemap to make allow projects to create more granular sitemaps. The idea here is that the main sitemap becomes a `sitemapindex` sitemap that points to multiple individual sitemap files. These include: - A sitemap of version URLs (this is the current prod sitemap) - A sitemap for each version (new) - A sitemap for each subproject version (new) The new sitemaps will 404 if the user doesn't output them. And in addition, this replaces the XML and XML comment approach, which doesn't render in many browsers, with a super basic XSLT template. The template isn't necessiarly needed but it's a nicer experience for browsers. --- readthedocs/core/static/core/xsl/sitemap.xslt | 98 +++++++++++++++++ readthedocs/proxito/views/serve.py | 102 +++++++++++++++--- readthedocs/templates/sitemap.xml | 71 +++++++----- 3 files changed, 233 insertions(+), 38 deletions(-) create mode 100644 readthedocs/core/static/core/xsl/sitemap.xslt diff --git a/readthedocs/core/static/core/xsl/sitemap.xslt b/readthedocs/core/static/core/xsl/sitemap.xslt new file mode 100644 index 00000000000..d8152ed3222 --- /dev/null +++ b/readthedocs/core/static/core/xsl/sitemap.xslt @@ -0,0 +1,98 @@ + + + + + + + + + + +
+

Documentation sitemap

+ +

+ This sitemap is autogenerated by Read the Docs. For projects with + subprojects, this sitemap links to the sitemap for each individual + subproject. For projects without subprojects, this sitemap links to + the project's public versions, sorted by version number. +

+ +

Links

+ +
    + + + +
  • + URL: + + + () + +
  • +
    + + + +
  • + Sitemap: + + + () + +
  • +
    +
+ +

Learn more

+ +
+
+ Sitemap documentation +
+
+ How sitemaps are generated and how to use and customize them for + your projects. +
+
+ Our guide about SEO techniques +
+
+ How sitemaps affect SEO and best practices to indexing your + documentation content. +
+
+
+ + +
+
diff --git a/readthedocs/proxito/views/serve.py b/readthedocs/proxito/views/serve.py index bf617427a8a..1939d81dd90 100644 --- a/readthedocs/proxito/views/serve.py +++ b/readthedocs/proxito/views/serve.py @@ -28,11 +28,13 @@ from readthedocs.core.unresolver import unresolver from readthedocs.core.utils.extend import SettingsOverrideObject from readthedocs.core.utils.requests import is_suspicious_request +from readthedocs.core.utils.url import unsafe_join_url_path from readthedocs.projects.constants import OLD_LANGUAGES_CODE_MAPPING from readthedocs.projects.constants import PRIVATE from readthedocs.projects.models import Domain from readthedocs.projects.models import Feature from readthedocs.projects.models import HTMLFile +from readthedocs.projects.models import Project from readthedocs.projects.templatetags.projects_tags import sort_version_aware from readthedocs.proxito.constants import RedirectType from readthedocs.proxito.exceptions import ContextualizedHttp404 @@ -780,7 +782,7 @@ class ServeRobotsTXT(SettingsOverrideObject): class ServeSitemapXMLBase(CDNCacheControlMixin, CDNCacheTagsMixin, View): - """Serve sitemap.xml from the domain's root.""" + """Serve a sitemap from the project root.""" # Always cache this view, since it's the same for all users. cache_response = True @@ -789,7 +791,90 @@ class ServeSitemapXMLBase(CDNCacheControlMixin, CDNCacheTagsMixin, View): def get(self, request): """ - Generate and serve a ``sitemap.xml`` for a particular ``project``. + Generate and serve project sitemap. + + Without any params, generate a sitemap index pointing to a sitemap for + each version of this project and it's subprojects. + """ + context = {} + if "versions" in request.GET: + context["versions"] = self.get_urlset(request) + else: + context["sitemaps"] = self.get_sitemapindex(request) + return render( + request, + "sitemap.xml", + context, + content_type="application/xml", + ) + + def get_sitemapindex(self, request): + """ + Generate and serve a sitemapindex sitemap pointing to other sitemaps. + + This will point towards: + + - Project urlset sitemap for links to all versions, using :py:method:`get_urlset` + - Sitemaps for each project version + - Sitemaps for each subproject + """ + + def get_public_versions(project, resolver=None): + if resolver is None: + resolver = Resolver() + public_versions = Version.internal.public( + project=project, + only_active=True, + include_hidden=False, + ) + for version in public_versions: + prefix = resolver.resolve( + project=project, + version_slug=version.slug, + ) + parsed_prefix = urlparse(prefix) + sitemap_url = parsed_prefix._replace( + path=unsafe_join_url_path(parsed_prefix.path, "sitemap.xml"), + ).geturl() + + sitemap = { + "loc": sitemap_url, + } + # Version can be enabled, but not ``built`` yet. We want to show the + # link without a ``lastmod`` attribute + last_build = version.builds.order_by("-date").first() + if last_build: + sitemap["lastmod"] = last_build.date.isoformat() + + yield sitemap + + # TODO is it important to share this between lookups? I saw other + # attempts to share this, I'm guessing to decrease lookup latency? + resolver = Resolver() + + sitemaps = [] + + project = request.unresolved_domain.project + public_subprojects = Project.objects.public().filter(superprojects__parent=project) + + # Version urlset list first. This is useful if the project doesn't + # output a sitemap per version. + sitemaps.append( + { + "loc": request.build_absolute_uri() + "?versions", + } + ) + + # Links to subproject and project versioned sitemaps + sitemaps.extend(get_public_versions(project, resolver=resolver)) + for subproject in public_subprojects: + sitemaps.extend(get_public_versions(subproject, resolver=resolver)) + + return sitemaps + + def get_urlset(self, request): + """ + Generate and serve a urlset sitemap for all public project versions. The sitemap is generated from all the ``active`` and public versions of ``project``. These versions are sorted by using semantic versioning @@ -846,11 +931,13 @@ def changefreqs_generator(): yield from itertools.chain(changefreqs, itertools.repeat("monthly")) project = request.unresolved_domain.project + public_versions = Version.internal.public( project=project, only_active=True, include_hidden=False, ) + if not public_versions.exists(): raise Http404() @@ -919,16 +1006,7 @@ def changefreqs_generator(): ) versions.append(element) - - context = { - "versions": versions, - } - return render( - request, - "sitemap.xml", - context, - content_type="application/xml", - ) + return versions def _get_project(self): # Method used by the CDNCacheTagsMixin class. diff --git a/readthedocs/templates/sitemap.xml b/readthedocs/templates/sitemap.xml index 70135851fe2..978c698fb64 100644 --- a/readthedocs/templates/sitemap.xml +++ b/readthedocs/templates/sitemap.xml @@ -1,30 +1,49 @@ - - - {% for version in versions %} - - {{ version.loc }} - {% for language in version.languages %} - +{# Use template for human readable version and informational content #} + + +{% comment %} + Sitemaps don't have a shared top level entity so we can only show one of these at a time: + + - A ``sitemapindex`` pointing to other sitemaps + - A ``urlset`` pointing to URLs for project public versions + + The view handles how to show one list or the other. +{% endcomment %} + +{% if sitemaps %} + + {% for sitemap in sitemaps %} + + {{ sitemap.loc }} + {% if sitemap.lastmod %} + {{ sitemap.lastmod }} + {% endif %} + + {% endfor %} + +{% elif versions %} + + {% for version in versions %} + + {{ version.loc }} + {% for language in version.languages %} + + {% endfor %} + {% if version.lastmod %} + {{ version.lastmod }} + {% endif %} + {{ version.changefreq }} + {{ version.priority }} + {% endfor %} - {% if version.lastmod %} - {{ version.lastmod }} - {% endif %} - {{ version.changefreq }} - {{ version.priority }} - - {% endfor %} - + +{% endif %}