
Commit 9898039

Siyuan Feng authored and tqchen committed
[docs] Download 3rd party embeds to local files
Add a script to download 3rd-party embedded resources (CSS, JS, etc.) to local files. This ensures that the documentation can be viewed offline and reduces external dependencies. The script:

1. Detects external URLs in HTML/CSS files
2. Downloads the resources to _static/downloads
3. Updates the HTML/CSS files to use the local copies
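For illustration (the domain and file name here are hypothetical, not part of the commit): a page embedding `<script src="https://cdn.example.net/lib.js"></script>` would have lib.js saved under _static/downloads/ and the tag rewritten to a relative link such as `<script src="_static/downloads/lib.js"></script>`.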
1 parent 9404fb5 commit 9898039

5 files changed

Lines changed: 319 additions & 274 deletions


docs/Makefile

Lines changed: 4 additions & 0 deletions
@@ -111,6 +111,10 @@ html: staging
 	@echo
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 
+htmldepoly: html
+	python3 $(PWD)/download_3rdparty_embeds.py -v
+	@echo "Replaced external URLs with local files."
+
 dirhtml: staging
 	cd $(STAGINGDIR) && $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(PWD)/$(BUILDDIR)/dirhtml
 	@echo
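In practice, deployment runs the new target from docs/ (i.e., `make htmldepoly`, target name as committed), which first builds the HTML and then rewrites external URLs in the built pages in place; the -v flag turns on the script's verbose logging.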

docs/download_3rdparty_embeds.py

Lines changed: 314 additions & 0 deletions
@@ -0,0 +1,314 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=redefined-outer-name, missing-module-docstring
import argparse
import hashlib
import os
import re
from html.parser import HTMLParser
from typing import Callable, Dict, List, Optional, Set, Tuple
from urllib.parse import urlparse

import requests

# NOTE: This script is invoked by the Makefile via `make htmldepoly`.
# It does not need to run on every CI docs build; it can instead run once
# at the deployment stage, after the docs have been built. The resources
# can also be downloaded manually beforehand to avoid re-fetching them on
# every build.

# Base site URL and location of the built HTML pages
BASE_URL = "https://tvm.apache.org"
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
HTML_DIR = os.path.join(SCRIPT_DIR, "_build/html")

class ExternalURLParser(HTMLParser):
    """HTML Parser to find external URLs in HTML content."""

    def __init__(self):
        super().__init__()
        self.external_urls: List[str] = []
        self.base_domain = urlparse(BASE_URL).netloc
        # Tags and their attributes that might contain external resources
        self.tags_to_check = {
            "img": "src",
            "script": "src",
            "iframe": "src",
            "video": "src",
            "audio": "src",
            "link": "href",
            "source": "src",
            "embed": "src",
        }

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Handle HTML start tags to find external URLs."""
        if tag not in self.tags_to_check:
            return

        attr_name = self.tags_to_check[tag]
        for name, value in attrs:
            if name != attr_name or not value:
                continue

            if value.startswith(("http://", "https://")):
                domain = urlparse(value).netloc
                if domain and domain != self.base_domain:
                    self.external_urls.append(value)

def detect_html_external_urls(html_content: str) -> List[str]:
    """
    Detect third-party embedded resources in HTML content.

    Parameters
    ----------
    html_content : str
        The HTML content to analyze

    Returns
    -------
    List[str]
        List of external URLs found in the HTML content
    """
    parser = ExternalURLParser()
    parser.feed(html_content)
    return parser.external_urls

def detect_css_external_urls(css_content: str) -> List[str]:
    """
    Detect external URLs in CSS content.

    Parameters
    ----------
    css_content : str
        The CSS content to analyze

    Returns
    -------
    List[str]
        List of external URLs found in the CSS content
    """
    external_urls: List[str] = []
    # Regex to find URLs in CSS
    url_pattern = re.compile(r'url\(["\']?(.*?)["\']?\)')
    matches = url_pattern.findall(css_content)
    for match in matches:
        if match.startswith(("http://", "https://")) and not match.startswith(BASE_URL):
            external_urls.append(match)
    return external_urls

def all_files_in_dir(path: str) -> List[str]:
    """
    Get a list of all files in a directory and its subdirectories.

    Parameters
    ----------
    path : str
        The root directory path to search

    Returns
    -------
    List[str]
        List of full paths to all files found
    """
    return [os.path.join(root, file) for root, _, files in os.walk(path) for file in files]

def detect_urls(files: List[str], verbose: bool = False) -> List[str]:
    """
    Detect external URLs in the given HTML and CSS files.

    Parameters
    ----------
    files : List[str]
        List of file paths to check for external URLs
    verbose : bool, optional
        Whether to print verbose output, by default False

    Returns
    -------
    List[str]
        List of external URLs found in the files
    """

    external_urls: Set[str] = set()
    for file in files:
        f_detect: Optional[Callable[[str], List[str]]] = None
        if file.endswith(".html"):
            f_detect = detect_html_external_urls
        elif file.endswith(".css"):
            f_detect = detect_css_external_urls
        else:
            continue
        with open(file, "r", encoding="utf-8") as f:
            content = f.read()
        urls = f_detect(content)
        if verbose:
            print(f"Processing {file}")
            exist_urls, new_urls = 0, 0
            for url in urls:
                if url in external_urls:
                    exist_urls += 1
                else:
                    new_urls += 1
                    print(f"Found new {url}")
            print(f"Found {exist_urls} existing resources and {new_urls} new resources")
        external_urls.update(urls)
    if verbose:
        print(f"Total {len(external_urls)} external resources")
        print("External resources:")
        print("\n".join(external_urls))

    return list(external_urls)

def download_external_urls(
    external_urls: List[str], verbose: bool = False
) -> Tuple[Dict[str, str], List[str]]:
    """
    Download external URLs and save them to docs/_static/downloads.

    Parameters
    ----------
    external_urls : List[str]
        List of external URLs to download
    verbose : bool, optional
        Whether to print verbose output, by default False

    Returns
    -------
    Tuple[Dict[str, str], List[str]]
        A tuple containing:
        - Dictionary mapping original URLs to their downloaded file paths
        - List of paths to all downloaded files (including source maps)
    """
    download_dir = os.path.join(HTML_DIR, "_static/downloads")
    os.makedirs(download_dir, exist_ok=True)
    used_file_names: Set[str] = set()
    downloaded_files: List[str] = []
    remap_urls: Dict[str, str] = {}
    for url in external_urls:
        query = urlparse(url).query
        if url.startswith("https://fonts.googleapis.com/css2"):
            file_name = f"{hashlib.md5(url.encode()).hexdigest()}.css"
        elif query:
            raise ValueError(f"Unsupported URL with query: {url}")
        else:
            file_name = urlparse(url).path.split("/")[-1]
        if verbose:
            print(f"remapping {url} to {file_name}")
        if file_name in used_file_names:
            raise ValueError(f"File name {file_name} already exists")
        used_file_names.add(file_name)
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # fail loudly instead of saving an error page
        body = response.content
        with open(os.path.join(download_dir, file_name), "wb") as f:
            f.write(body)
        remap_urls[url] = os.path.join(download_dir, file_name)
        downloaded_files.append(os.path.join(download_dir, file_name))

        # Also download the sourceMappingURL, if the server provides one
        if not url.startswith("https://fonts.googleapis.com/css2"):
            map_file_name = f"{file_name}.map"
            response = requests.get(f"{url}.map", timeout=30)
            if response.status_code == 200:
                body = response.content
                with open(os.path.join(download_dir, map_file_name), "wb") as f:
                    f.write(body)
                if verbose:
                    print(f"Downloaded {map_file_name} for {url}")
                downloaded_files.append(os.path.join(download_dir, map_file_name))

    return remap_urls, downloaded_files

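# --- Editorial aside (not part of the committed file) -----------------------
# Google Fonts "css2" URLs carry a query string, so they cannot be named after
# their last path segment like other resources; the function above hashes the
# full URL instead. For a hypothetical URL:
#     url = "https://fonts.googleapis.com/css2?family=Roboto"
#     file_name = hashlib.md5(url.encode()).hexdigest() + ".css"
# this yields a stable 32-hex-digit name, so repeated runs map the same URL to
# the same local file. Any other URL with a query string raises ValueError.
# -----------------------------------------------------------------------------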
def replace_urls_in_files(remap_urls: Dict[str, str], verbose: bool = False):
    """
    Replace external URLs with their downloaded versions in HTML/CSS files.

    Parameters
    ----------
    remap_urls : Dict[str, str]
        Dictionary mapping original URLs to their downloaded file paths
    verbose : bool, optional
        Whether to print verbose output, by default False
    """
    for root, _, files in os.walk(HTML_DIR):
        for file in files:
            if not (file.endswith(".html") or file.endswith(".css")):
                continue

            file_path = os.path.join(root, file)
            if verbose:
                print(f"Processing {file_path}")

            # Calculate relative path from current file to _static/downloads
            rel_path = os.path.relpath(
                os.path.join(HTML_DIR, "_static/downloads"), os.path.dirname(file_path)
            )

            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            new_content = content
            for original_url, new_path in remap_urls.items():
                relative_url = os.path.join(rel_path, os.path.basename(new_path))
                new_content = new_content.replace(original_url, relative_url)

            if new_content != content:
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(new_content)
                if verbose:
                    print(f"Updated {file_path}")

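# --- Editorial aside (not part of the committed file) -----------------------
# The os.path.relpath call above keeps rewritten links correct at any nesting
# depth. For a hypothetical page at HTML_DIR/tutorials/index.html, rel_path is
# "../_static/downloads", so a downloaded lib.js is referenced as
# "../_static/downloads/lib.js" from that page.
# -----------------------------------------------------------------------------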
def download_and_replace_urls(files: Optional[List[str]] = None, verbose: bool = False):
    """
    Download external URLs found in files and replace them with local copies.
    Recursively processes any new external URLs found in downloaded content.

    Parameters
    ----------
    files : Optional[List[str]], optional
        List of files to check for external URLs. If None, checks all files under HTML_DIR
    verbose : bool, optional
        Whether to print verbose output, by default False
    """
    if files is None:
        files = all_files_in_dir(HTML_DIR)
    remap_urls: Dict[str, str] = {}
    # Downloaded files (e.g. CSS) may reference further external resources,
    # so keep detecting and downloading until a fixed point is reached.
    while True:
        external_urls = detect_urls(files, verbose=verbose)
        if not external_urls:
            break
        round_remap_urls, files = download_external_urls(external_urls, verbose=verbose)
        remap_urls.update(round_remap_urls)

    replace_urls_in_files(remap_urls, verbose=verbose)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()
    download_and_replace_urls(verbose=args.verbose)
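As a quick sanity check, here is a minimal sketch exercising the two detector helpers; the sample markup and the cdn.example.net domain are hypothetical, and it assumes the script is importable as download_3rdparty_embeds (e.g., run from the docs/ directory):

# Hypothetical inputs; cdn.example.net is not from the commit.
from download_3rdparty_embeds import (
    detect_css_external_urls,
    detect_html_external_urls,
)

sample_html = '<script src="https://cdn.example.net/lib.js"></script>'
sample_css = '.logo { background: url("https://cdn.example.net/logo.png"); }'

print(detect_html_external_urls(sample_html))
# ['https://cdn.example.net/lib.js']
print(detect_css_external_urls(sample_css))
# ['https://cdn.example.net/logo.png']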
