
Commit 4a40449

Author: Siyuan Feng (committed)
[docs] Download 3rd party embeds to local files
Add a script to download 3rd-party embedded resources (CSS, JS, etc.) to local files. This ensures that the documentation can be viewed offline and reduces external dependencies. The script:

1. Detects external URLs in HTML/CSS files
2. Downloads the resources to _static/downloads
3. Updates the HTML/CSS files to use local copies
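Besides the Makefile hook below, the script can be driven standalone once the built HTML exists under docs/_build/html; a minimal sketch (the import assumes the working directory is docs/):

    # Equivalent to `python3 download_3rdparty_embeds.py -v` from the docs/ directory.
    from download_3rdparty_embeds import download_and_replace_urls

    download_and_replace_urls(verbose=True)  # detect -> download -> rewrite, repeated until no external URLs remain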
1 parent a65d3d4 commit 4a40449

4 files changed

Lines changed: 315 additions & 273 deletions

File tree

docs/Makefile

Lines changed: 7 additions & 0 deletions
@@ -106,10 +106,17 @@ staging:
 	ln -s $(PWD)/_static $(STAGINGDIR)/_static
 
 
+htmllocal: staging
+	cd $(STAGINGDIR) && $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(PWD)/$(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
 html: staging
 	cd $(STAGINGDIR) && $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(PWD)/$(BUILDDIR)/html
 	@echo
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+	python3 $(PWD)/download_3rdparty_embeds.py -v
+	@echo "Replaced external URLs with local files."
 
 dirhtml: staging
 	cd $(STAGINGDIR) && $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(PWD)/$(BUILDDIR)/dirhtml
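With this hunk, `make html` runs download_3rdparty_embeds.py after the Sphinx build finishes, so the published pages reference local copies, while the new `htmllocal` target keeps a plain build with no URL rewriting (presumably for faster local iteration; the target split is from the hunk above, the reading of its intent is an inference).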

docs/download_3rdparty_embeds.py

Lines changed: 308 additions & 0 deletions
@@ -0,0 +1,308 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=redefined-outer-name, missing-module-docstring
import argparse
import hashlib
import os
import re
from html.parser import HTMLParser
from typing import Callable, Dict, List, Optional, Set, Tuple
from urllib.parse import urlparse

import requests

# Base site URL (URLs on this domain are not treated as external) and
# the Sphinx HTML output directory this script post-processes.
BASE_URL = "https://tvm.apache.org"
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
HTML_DIR = os.path.join(SCRIPT_DIR, "_build/html")


class ExternalURLParser(HTMLParser):
    """HTML parser that collects external URLs found in HTML content."""

    def __init__(self):
        super().__init__()
        self.external_urls: List[str] = []
        self.base_domain = urlparse(BASE_URL).netloc
        # Tags and the attribute on each that might reference an external resource
        self.tags_to_check = {
            "img": "src",
            "script": "src",
            "iframe": "src",
            "video": "src",
            "audio": "src",
            "link": "href",
            "source": "src",
            "embed": "src",
        }

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """Handle HTML start tags to find external URLs."""
        if tag not in self.tags_to_check:
            return

        attr_name = self.tags_to_check[tag]
        for name, value in attrs:
            if name != attr_name or not value:
                continue

            if value.startswith(("http://", "https://")):
                domain = urlparse(value).netloc
                if domain and domain != self.base_domain:
                    self.external_urls.append(value)


def detect_html_external_urls(html_content: str) -> List[str]:
    """
    Detect third-party embedded resources in HTML content.

    Parameters
    ----------
    html_content : str
        The HTML content to analyze

    Returns
    -------
    List[str]
        List of external URLs found in the HTML content
    """
    parser = ExternalURLParser()
    parser.feed(html_content)
    return parser.external_urls


def detect_css_external_urls(css_content: str) -> List[str]:
    """
    Detect external URLs in CSS content.

    Parameters
    ----------
    css_content : str
        The CSS content to analyze

    Returns
    -------
    List[str]
        List of external URLs found in the CSS content
    """
    external_urls: List[str] = []
    # Regex to find url(...) references in CSS
    url_pattern = re.compile(r'url\(["\']?(.*?)["\']?\)')
    matches = url_pattern.findall(css_content)
    for match in matches:
        if match.startswith(("http://", "https://")) and not match.startswith(BASE_URL):
            external_urls.append(match)
    return external_urls


def all_files_in_dir(path: str) -> List[str]:
    """
    Get a list of all files in a directory and its subdirectories.

    Parameters
    ----------
    path : str
        The root directory path to search

    Returns
    -------
    List[str]
        List of full paths to all files found
    """
    return [os.path.join(root, file) for root, _, files in os.walk(path) for file in files]


def detect_urls(files: List[str], verbose: bool = False) -> List[str]:
    """
    Detect external URLs in the given HTML and CSS files.

    Parameters
    ----------
    files : List[str]
        List of file paths to check for external URLs
    verbose : bool, optional
        Whether to print verbose output, by default False

    Returns
    -------
    List[str]
        List of external URLs found in the files
    """

    external_urls: Set[str] = set()
    for file in files:
        # Pick the detector by file type; skip anything that is not HTML or CSS.
        f_detect: Optional[Callable[[str], List[str]]] = None
        if file.endswith(".html"):
            f_detect = detect_html_external_urls
        elif file.endswith(".css"):
            f_detect = detect_css_external_urls
        else:
            continue
        with open(file, "r", encoding="utf-8") as f:
            content = f.read()
        urls = f_detect(content)
        if verbose:
            print(f"Processing {file}")
            exist_urls, new_urls = 0, 0
            for url in urls:
                if url in external_urls:
                    exist_urls += 1
                else:
                    new_urls += 1
                    print(f"Found new {url}")
            print(f"Found {exist_urls} existing resources and {new_urls} new resources")
        external_urls.update(urls)
    if verbose:
        print(f"Total {len(external_urls)} external resources")
        print("External resources:")
        print("\n".join(external_urls))

    return list(external_urls)


def download_external_urls(
    external_urls: List[str], verbose: bool = False
) -> Tuple[Dict[str, str], List[str]]:
    """
    Download external URLs and save them to docs/_static/downloads.

    Parameters
    ----------
    external_urls : List[str]
        List of external URLs to download
    verbose : bool, optional
        Whether to print verbose output, by default False

    Returns
    -------
    Tuple[Dict[str, str], List[str]]
        A tuple containing:
        - Dictionary mapping original URLs to their downloaded file paths
        - List of paths to all downloaded files (including source maps)
    """
    download_dir = os.path.join(HTML_DIR, "_static/downloads")
    os.makedirs(download_dir, exist_ok=True)
    used_file_names: Set[str] = set()
    downloaded_files: List[str] = []
    remap_urls: Dict[str, str] = {}
    for url in external_urls:
        query = urlparse(url).query
        if url.startswith("https://fonts.googleapis.com/css2"):
            # Google Fonts URLs encode the font selection in the query string,
            # so hash the full URL to get a stable, unique local file name.
            file_name = f"{hashlib.md5(url.encode()).hexdigest()}.css"
        elif query:
            raise ValueError(f"Unsupported URL with query: {url}")
        else:
            file_name = urlparse(url).path.split("/")[-1]
        if verbose:
            print(f"remapping {url} to {file_name}")
        if file_name in used_file_names:
            raise ValueError(f"File name {file_name} already exists")
        used_file_names.add(file_name)
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # fail loudly instead of saving an error page
        body = response.content
        with open(os.path.join(download_dir, file_name), "wb") as f:
            f.write(body)
        remap_urls[url] = os.path.join(download_dir, file_name)
        downloaded_files.append(os.path.join(download_dir, file_name))

        # Also download the sourceMappingURL, if one exists
        if not url.startswith("https://fonts.googleapis.com/css2"):
            map_file_name = f"{file_name}.map"
            response = requests.get(f"{url}.map", timeout=30)
            if response.status_code == 200:
                body = response.content
                with open(os.path.join(download_dir, map_file_name), "wb") as f:
                    f.write(body)
                if verbose:
                    print(f"Downloaded {map_file_name} for {url}")
                downloaded_files.append(os.path.join(download_dir, map_file_name))

    return remap_urls, downloaded_files


def replace_urls_in_files(remap_urls: Dict[str, str], verbose: bool = False):
    """
    Replace external URLs with their downloaded versions in HTML/CSS files.

    Parameters
    ----------
    remap_urls : Dict[str, str]
        Dictionary mapping original URLs to their downloaded file paths
    verbose : bool, optional
        Whether to print verbose output, by default False
    """
    for root, _, files in os.walk(HTML_DIR):
        for file in files:
            if not (file.endswith(".html") or file.endswith(".css")):
                continue

            file_path = os.path.join(root, file)
            if verbose:
                print(f"Processing {file_path}")

            # Calculate the relative path from the current file to _static/downloads
            rel_path = os.path.relpath(
                os.path.join(HTML_DIR, "_static/downloads"), os.path.dirname(file_path)
            )

            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            new_content = content
            for original_url, new_path in remap_urls.items():
                relative_url = os.path.join(rel_path, os.path.basename(new_path))
                new_content = new_content.replace(original_url, relative_url)

            if new_content != content:
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(new_content)
                if verbose:
                    print(f"Updated {file_path}")


def download_and_replace_urls(files: Optional[List[str]] = None, verbose: bool = False):
    """
    Download external URLs found in files and replace them with local copies.
    Recursively processes any new external URLs found in downloaded content.

    Parameters
    ----------
    files : Optional[List[str]], optional
        List of files to check for external URLs. If None, checks all files under HTML_DIR
    verbose : bool, optional
        Whether to print verbose output, by default False
    """
    if files is None:
        files = all_files_in_dir(HTML_DIR)
    remap_urls = {}
    # Each round scans only the files downloaded in the previous round, so the
    # loop terminates once a round's downloads contain no new external URLs.
    while True:
        external_urls = detect_urls(files, verbose=verbose)
        if not external_urls:
            break
        round_remap_urls, files = download_external_urls(external_urls, verbose=verbose)
        remap_urls.update(round_remap_urls)

    replace_urls_in_files(remap_urls, verbose=verbose)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()
    download_and_replace_urls(verbose=args.verbose)
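For illustration, a minimal sketch of what one pass of the pipeline does to a page, using the functions defined above (the CDN URL and HTML snippet are hypothetical, not taken from the TVM docs):

    html = '<script src="https://cdn.example.com/lib/widget.js"></script>'
    urls = detect_html_external_urls(html)
    # urls == ['https://cdn.example.com/lib/widget.js']
    # download_external_urls(urls) would save the response body to
    # _static/downloads/widget.js; replace_urls_in_files() then rewrites
    # each page to reference the local copy via a relative path:
    html = html.replace(urls[0], "_static/downloads/widget.js")
    # -> '<script src="_static/downloads/widget.js"></script>'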
