From 715febde6458f320fab2889e5e8790e3b011a771 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 29 Aug 2024 14:08:26 +0200 Subject: [PATCH 1/9] Draft script to find diverging links See https://github.com/Quansight-Labs/czi-scientific-python-mgmt/issues/88 Incomplete, in particular we should handle relative and anchor links, starting with #, and . $ python tools/divergent_links.py docs/_build/html --- tools/divergent_links.py | 87 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 tools/divergent_links.py diff --git a/tools/divergent_links.py b/tools/divergent_links.py new file mode 100644 index 000000000..7b7dbc40e --- /dev/null +++ b/tools/divergent_links.py @@ -0,0 +1,87 @@ +"""This script help checking divergent links. + +That is to say, links to the same page, +that have different titles. +""" + +import os +import sys +from collections import defaultdict + +from bs4 import BeautifulSoup + +ignores = ["#", "next", "previous"] + + +def find_html_files(folder_path): + """Find all html files in given folder.""" + html_files = [] + for root, dirs, files in os.walk(folder_path): + for file in files: + if file.endswith(".html"): + html_files.append(os.path.join(root, file)) + return html_files + + +class Checker: + """Link checker.""" + + links: dict[str, list] + + def __init__(self): + self.links = defaultdict(list) + + def scan(self, html_content, identifier): + """Scan given file for html links.""" + # Parse the HTML content using BeautifulSoup + soup = BeautifulSoup(html_content, "html.parser") + + # Dictionary to store URLs and their corresponding titles + + # Extract all anchor tags + for a_tag in soup.find_all("a", href=True): + url = a_tag["href"] + if url.startswith("#"): + continue + content = a_tag.text.strip().lower() + if content in ignores: + continue + if content.split("\n")[0] in ignores: + continue + + self.links[content].append((url, identifier)) + + def duplicates(self): + """Print potential duplicates.""" + for content, url_pages in self.links.items(): + uniq_url = {u for u, _ in url_pages} + if len(uniq_url) >= 2: + print(f"{content} has divergent url:") + for u, p in url_pages: + print(" ", u, "in", p) + + +# Example usage +data = """ + + + Visit Example + Check Example + Visit OpenAI + Learn about OpenAI + + +""" + +c = Checker() +# Call the function and print results +# inconsistencies = c.scan(data, "C0") + +print(sys.argv) + +for file in find_html_files(sys.argv[1]): + with open(file) as f: + data = f.read() + c.scan(data, file) + +c.duplicates() From 357b8e34286cf7f67e083315dec3909f32cc6e0e Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 12 Sep 2024 11:20:09 +0200 Subject: [PATCH 2/9] cleanup a few diverging links --- docs/community/topics/dependencies-js.md | 2 +- docs/community/topics/manual-dev.md | 4 ++-- docs/user_guide/accessibility.md | 2 +- docs/user_guide/indices.rst | 2 +- tools/divergent_links.py | 27 ++++++++++++++++++++---- 5 files changed, 28 insertions(+), 9 deletions(-) diff --git a/docs/community/topics/dependencies-js.md b/docs/community/topics/dependencies-js.md index 3887b360d..04bd0a830 100644 --- a/docs/community/topics/dependencies-js.md +++ b/docs/community/topics/dependencies-js.md @@ -8,7 +8,7 @@ There are two kinds of dependency definitions in this theme: To update or add a JS dependency, follow these steps: 1. **Edit `package.json`** by adding or modifying a dependency. -2. **Re-generate `package-lock.json`** in order to create a new set of frozen dependencies for the theme. To do this, run the following command from [the Sphinx Theme Builder](https://github.com/pradyunsg/sphinx-theme-builder). +2. **Re-generate `package-lock.json`** in order to create a new set of frozen dependencies for the theme. To do this, run the following command from [the Sphinx Theme Builder](https://sphinx-theme-builder.readthedocs.io/en/latest/). ``` stb npm install --include=dev diff --git a/docs/community/topics/manual-dev.md b/docs/community/topics/manual-dev.md index 5c952633a..c9e79675e 100644 --- a/docs/community/topics/manual-dev.md +++ b/docs/community/topics/manual-dev.md @@ -18,7 +18,7 @@ To do so, use a tool like [conda](https://docs.conda.io/en/latest/), [mamba](htt Before you start, ensure that you have the following installed: - Python >= 3.9 -- [Pandoc](https://pandoc.org/installing.html): we use `nbsphinx` to support notebook (.ipynb) files in the documentation, which requires [installing Pandoc](https://pandoc.org/installing.html) at a system level (or within a Conda environment). +- [Pandoc](https://pandoc.org/): we use `nbsphinx` to support notebook (`.ipynb`) files in the documentation, which requires [installing Pandoc](https://pandoc.org/installing.html) at a system level (or within a Conda environment). ## Clone the repository locally @@ -66,7 +66,7 @@ To manually open a server to watch your documentation for changes, build them, a $ stb serve docs --open-browser ``` -## Run the tests +## Manually Run the tests To manually run the tests for this theme, first set up your environment locally, and then run: diff --git a/docs/user_guide/accessibility.md b/docs/user_guide/accessibility.md index 01ca24329..3fe7d70a7 100644 --- a/docs/user_guide/accessibility.md +++ b/docs/user_guide/accessibility.md @@ -69,7 +69,7 @@ Site maps, usually served from a file called `sitemap.xml` are a broadly-employe approach to telling programs like search engines and assistive technologies where different content appears on a website. -If using a service like [ReadTheDocs](https://readthedocs.com), these files +If using a service like [ReadTheDocs](https://about.readthedocs.com/), these files will be created for you _automatically_, but for some other approaches below, it's handy to generate a `sitemap.xml` locally or in CI with a tool like [sphinx-sitemap](https://pypi.org/project/sphinx-sitemap/). diff --git a/docs/user_guide/indices.rst b/docs/user_guide/indices.rst index 6bbb9c279..cd1d29e83 100644 --- a/docs/user_guide/indices.rst +++ b/docs/user_guide/indices.rst @@ -19,4 +19,4 @@ By design the indices pages are not linked in a documentation generated with thi .. note:: - Don't forget to add back the ``"sidebar-ethical-ads.html"`` template if you are serving your documentation using `ReadTheDocs `__. + Don't forget to add back the ``"sidebar-ethical-ads.html"`` template if you are serving your documentation using `ReadTheDocs `__. diff --git a/tools/divergent_links.py b/tools/divergent_links.py index 7b7dbc40e..3cd5ceced 100644 --- a/tools/divergent_links.py +++ b/tools/divergent_links.py @@ -10,7 +10,17 @@ from bs4 import BeautifulSoup -ignores = ["#", "next", "previous"] +ignores = [ + "#", + "next", + "previous", + "[source]", + "edit on github", + "[docs]", + "read more ...", + "show source", + "module", +] def find_html_files(folder_path): @@ -48,17 +58,26 @@ def scan(self, html_content, identifier): continue if content.split("\n")[0] in ignores: continue + from urllib.parse import urljoin - self.links[content].append((url, identifier)) + fullurl = urljoin(identifier, url) + self.links[content].append((fullurl, identifier)) def duplicates(self): """Print potential duplicates.""" for content, url_pages in self.links.items(): uniq_url = {u for u, _ in url_pages} if len(uniq_url) >= 2: - print(f"{content} has divergent url:") + print( + f"{len(url_pages)} time {content!r} has {len(uniq_url)} on divergent url on :" + ) + dct = defaultdict(list) for u, p in url_pages: - print(" ", u, "in", p) + dct[u].append(p) + for u, ps in dct.items(): + print(" ", u, "in") + for p in ps: + print(" ", p) # Example usage From 8212b3bed52b666dd03c3a6fb81fe07472dfa3c9 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Mon, 30 Sep 2024 01:10:17 -0700 Subject: [PATCH 3/9] Apply suggestions from code review Co-authored-by: gabalafou --- docs/community/topics/dependencies-js.md | 2 +- docs/community/topics/manual-dev.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/community/topics/dependencies-js.md b/docs/community/topics/dependencies-js.md index 04bd0a830..2e9979c13 100644 --- a/docs/community/topics/dependencies-js.md +++ b/docs/community/topics/dependencies-js.md @@ -8,7 +8,7 @@ There are two kinds of dependency definitions in this theme: To update or add a JS dependency, follow these steps: 1. **Edit `package.json`** by adding or modifying a dependency. -2. **Re-generate `package-lock.json`** in order to create a new set of frozen dependencies for the theme. To do this, run the following command from [the Sphinx Theme Builder](https://sphinx-theme-builder.readthedocs.io/en/latest/). +2. **Re-generate `package-lock.json`** in order to create a new set of frozen dependencies for the theme. To do this, run the following command from the [Sphinx Theme Builder](https://sphinx-theme-builder.readthedocs.io/en/latest/). ``` stb npm install --include=dev diff --git a/docs/community/topics/manual-dev.md b/docs/community/topics/manual-dev.md index c9e79675e..6bf548897 100644 --- a/docs/community/topics/manual-dev.md +++ b/docs/community/topics/manual-dev.md @@ -66,7 +66,7 @@ To manually open a server to watch your documentation for changes, build them, a $ stb serve docs --open-browser ``` -## Manually Run the tests +## Manually run the tests To manually run the tests for this theme, first set up your environment locally, and then run: From bd96c77159c1e448369a39bf5432474b17eb79f1 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Mon, 30 Sep 2024 11:01:30 +0200 Subject: [PATCH 4/9] take reviews into accounts --- tools/divergent_links.py | 81 ++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/tools/divergent_links.py b/tools/divergent_links.py index 3cd5ceced..1482bfe26 100644 --- a/tools/divergent_links.py +++ b/tools/divergent_links.py @@ -1,15 +1,32 @@ -"""This script help checking divergent links. +"""This script help checking inconsistent links. -That is to say, links to the same page, -that have different titles. +That is to say, links that have the same title but link to the same place. +This is useful for screen-reader and accessibility devices, where the user may +say "Go to X", but is there are 2 links X this can be confusing. + + +Example (links that have the same name, but different URL): + + We have a JavaScript API and + a Python API. + +How to fix (give the links different names): + + We have a JavaScript API and + a Python API. """ import os import sys from collections import defaultdict +from urllib.parse import urljoin from bs4 import BeautifulSoup +# when looking at inconsistent links across pages, +# a number of text is recurrent and appear on many pages. +# So we'll ignore these. + ignores = [ "#", "next", @@ -41,7 +58,7 @@ class Checker: def __init__(self): self.links = defaultdict(list) - def scan(self, html_content, identifier): + def scan(self, html_content, file_path): """Scan given file for html links.""" # Parse the HTML content using BeautifulSoup soup = BeautifulSoup(html_content, "html.parser") @@ -51,17 +68,21 @@ def scan(self, html_content, identifier): # Extract all anchor tags for a_tag in soup.find_all("a", href=True): url = a_tag["href"] + + # These are usually link into the same page ("see below", or even + # header anchors we thus exclude those. if url.startswith("#"): continue content = a_tag.text.strip().lower() if content in ignores: continue + # Some links are "$Title\nNext", or "$Title\nprev", so we only + # want to look at what is before the `\n` if content.split("\n")[0] in ignores: continue - from urllib.parse import urljoin - fullurl = urljoin(identifier, url) - self.links[content].append((fullurl, identifier)) + fullurl = urljoin(file_path, url) + self.links[content].append((fullurl, file_path)) def duplicates(self): """Print potential duplicates.""" @@ -80,27 +101,33 @@ def duplicates(self): print(" ", p) -# Example usage -data = """ - - - Visit Example - Check Example - Visit OpenAI - Learn about OpenAI - - -""" +if len(sys.argv) == 3 and sys.argv[2] == "--all": + c = Checker() + + for file in find_html_files(sys.argv[1]): + with open(file) as f: + data = f.read() + c.scan(data, file) -c = Checker() -# Call the function and print results -# inconsistencies = c.scan(data, "C0") + c.duplicates() +elif len(sys.argv) == 2: + for file in find_html_files(sys.argv[1]): + with open(file) as f: + data = f.read() + c = Checker() + c.scan(data, file) + c.duplicates() +else: + print( + """ +Check each page individually for incoherent links -print(sys.argv) + python tools/divergent_links.py docs/_build/html/ -for file in find_html_files(sys.argv[1]): - with open(file) as f: - data = f.read() - c.scan(data, file) +Check all pages for global (and local) incoherent links -c.duplicates() + python tools/divergent_links.py docs/_build/html/ --all + +""" + ) + sys.exit(1) From bff2591ea4cb1e6cd33f7e83bc3ed8a51f273201 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 3 Oct 2024 17:08:32 +0200 Subject: [PATCH 5/9] Apply suggestions from code review Co-authored-by: gabalafou --- tools/divergent_links.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/divergent_links.py b/tools/divergent_links.py index 1482bfe26..0b788b689 100644 --- a/tools/divergent_links.py +++ b/tools/divergent_links.py @@ -1,8 +1,8 @@ """This script help checking inconsistent links. -That is to say, links that have the same title but link to the same place. +That is to say, links that have the same title but go to different places. This is useful for screen-reader and accessibility devices, where the user may -say "Go to X", but is there are 2 links X this can be confusing. +say "Go to X", but if there are 2 links named "X" this creates ambiguity. Example (links that have the same name, but different URL): @@ -120,11 +120,13 @@ def duplicates(self): else: print( """ -Check each page individually for incoherent links +Check page-wise link consistency +(links with the same name on the same page should go to the same URL) python tools/divergent_links.py docs/_build/html/ -Check all pages for global (and local) incoherent links +Check site-wide link consistency +(links with the same name across all pages should go the same URL) python tools/divergent_links.py docs/_build/html/ --all From 9bc541c2c8484d7ccd44a9e40ece875d7c121067 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 3 Oct 2024 17:15:21 +0200 Subject: [PATCH 6/9] trailing whitespaces --- docs/community/practices/versions.md | 1 - docs/user_guide/theme-elements.md | 3 --- tools/divergent_links.py | 4 ++-- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/community/practices/versions.md b/docs/community/practices/versions.md index b52bcd189..18d7a7364 100644 --- a/docs/community/practices/versions.md +++ b/docs/community/practices/versions.md @@ -14,7 +14,6 @@ We define "support" as testing against each of these versions so that users can For example, if we made a minor release tomorrow, we'd [look at the EOL schedule for Python](https://endoflife.date/python) and support all the versions that fall within a 3.5-year window. [^1]: Our support for Python versions is inspired by [NEP 029](https://numpy.org/neps/nep-0029-deprecation_policy.html). - [^2]: These policies are goals, not promises. We are a volunteer-led community with limited time. Consider these sections to be our intention, but we recognize that we may not always be able to meet these criteria if we cannot do so. We welcome contributions from others to help us more sustainably meet these goals! ## Supported Sphinx versions diff --git a/docs/user_guide/theme-elements.md b/docs/user_guide/theme-elements.md index 3e54a750d..464f5cd57 100644 --- a/docs/user_guide/theme-elements.md +++ b/docs/user_guide/theme-elements.md @@ -203,11 +203,8 @@ Here's a numeric footnote[^1], another one (preceded by a space) [^2], a named f All will end up as numbers in the rendered HTML, but in the source they look like `[^1]`, `[^2]`, `[^named]` and `[^*]`. [^1]: Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. - [^2]: Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. - [^named]: Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. - [^*]: Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. ## Link shortening for git repository services diff --git a/tools/divergent_links.py b/tools/divergent_links.py index 0b788b689..ea00b83eb 100644 --- a/tools/divergent_links.py +++ b/tools/divergent_links.py @@ -120,12 +120,12 @@ def duplicates(self): else: print( """ -Check page-wise link consistency +Check page-wise link consistency (links with the same name on the same page should go to the same URL) python tools/divergent_links.py docs/_build/html/ -Check site-wide link consistency +Check site-wide link consistency (links with the same name across all pages should go the same URL) python tools/divergent_links.py docs/_build/html/ --all From 1278dfe9b4a62b31e56391244fe1ddd931a99dc2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Oct 2024 15:15:46 +0000 Subject: [PATCH 7/9] [pre-commit.ci] Automatic linting and formatting fixes --- docs/community/practices/versions.md | 1 + docs/user_guide/theme-elements.md | 3 +++ 2 files changed, 4 insertions(+) diff --git a/docs/community/practices/versions.md b/docs/community/practices/versions.md index 18d7a7364..b52bcd189 100644 --- a/docs/community/practices/versions.md +++ b/docs/community/practices/versions.md @@ -14,6 +14,7 @@ We define "support" as testing against each of these versions so that users can For example, if we made a minor release tomorrow, we'd [look at the EOL schedule for Python](https://endoflife.date/python) and support all the versions that fall within a 3.5-year window. [^1]: Our support for Python versions is inspired by [NEP 029](https://numpy.org/neps/nep-0029-deprecation_policy.html). + [^2]: These policies are goals, not promises. We are a volunteer-led community with limited time. Consider these sections to be our intention, but we recognize that we may not always be able to meet these criteria if we cannot do so. We welcome contributions from others to help us more sustainably meet these goals! ## Supported Sphinx versions diff --git a/docs/user_guide/theme-elements.md b/docs/user_guide/theme-elements.md index 464f5cd57..3e54a750d 100644 --- a/docs/user_guide/theme-elements.md +++ b/docs/user_guide/theme-elements.md @@ -203,8 +203,11 @@ Here's a numeric footnote[^1], another one (preceded by a space) [^2], a named f All will end up as numbers in the rendered HTML, but in the source they look like `[^1]`, `[^2]`, `[^named]` and `[^*]`. [^1]: Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. + [^2]: Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. + [^named]: Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. + [^*]: Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. Foo bar foo bar. ## Link shortening for git repository services From 20f93bf525cce052c22a038f41e03b2b11fdb24c Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Fri, 4 Oct 2024 01:18:09 -0700 Subject: [PATCH 8/9] Update tools/divergent_links.py Co-authored-by: Daniel McCloy --- tools/divergent_links.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/divergent_links.py b/tools/divergent_links.py index ea00b83eb..6dd84ff2c 100644 --- a/tools/divergent_links.py +++ b/tools/divergent_links.py @@ -90,7 +90,8 @@ def duplicates(self): uniq_url = {u for u, _ in url_pages} if len(uniq_url) >= 2: print( - f"{len(url_pages)} time {content!r} has {len(uniq_url)} on divergent url on :" + f'The link text "{content!r}" appears {len(url_pages)} times, ' + f'and links to {len(uniq_url)} different URLs, on the following pages:' ) dct = defaultdict(list) for u, p in url_pages: From 1ce920d7dfac5f38f9b67ffe5a4251433a871468 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 4 Oct 2024 08:18:29 +0000 Subject: [PATCH 9/9] [pre-commit.ci] Automatic linting and formatting fixes --- tools/divergent_links.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/divergent_links.py b/tools/divergent_links.py index 6dd84ff2c..ed78d7892 100644 --- a/tools/divergent_links.py +++ b/tools/divergent_links.py @@ -90,8 +90,8 @@ def duplicates(self): uniq_url = {u for u, _ in url_pages} if len(uniq_url) >= 2: print( - f'The link text "{content!r}" appears {len(url_pages)} times, ' - f'and links to {len(uniq_url)} different URLs, on the following pages:' + f'The link text "{content!r}" appears {len(url_pages)} times, ' + f"and links to {len(uniq_url)} different URLs, on the following pages:" ) dct = defaultdict(list) for u, p in url_pages: