Skip to content

Commit 2efb30e

Browse files
committed
Use a queue when doing OCR
This allows running the OCR at the same time as the conversion, using all the available CPUs. This results in a x2.4 speedup on my quite beefy machine (AMD Ryzen, 16 CPUs).

Fixes #1329

Before this change (`time`, on a 220-page PDF, with OCR):

```
________________________________________________________
Executed in  274.08 secs    fish           external
   usr time  271.77 secs    1.06 millis  271.77 secs
   sys time    0.79 secs    0.01 millis    0.79 secs
```

After this change:

```
________________________________________________________
Executed in   44.78 secs    fish           external
   usr time  622.67 secs  608.00 micros  622.66 secs
   sys time    2.94 secs   62.00 micros    2.94 secs
```
1 parent 9d190bd commit 2efb30e

2 files changed

Lines changed: 109 additions & 37 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or
1717

1818
- Dangerzone is able to function without a bundled `container.tar` file
1919
([#1400](https://github.com/freedomofpress/dangerzone/pull/1400))
20+
- OCR tasks are now queued, resulting in up to a 2.4x speedup
21+
([#1329](https://github.com/freedomofpress/dangerzone/issues/1329))
2022

2123

2224
### Development changes

dangerzone/isolation_provider/base.py

Lines changed: 107 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import contextlib
22
import logging
3+
import multiprocessing as mp
34
import os
45
import platform
6+
import queue
57
import signal
68
import subprocess
79
import sys
@@ -79,6 +81,30 @@ def sanitize_debug_text(text: bytes) -> str:
7981
return replace_control_chars(untrusted_text, keep_newlines=True)
8082

8183

84+
def _ocr_page_worker(
    pixmap_bytes: bytes, width: int, height: int, ocr_lang: str, tessdata_dir: str
) -> bytes:
    """OCR a single page of raw pixels in a worker process; return PDF bytes.

    The pixel buffer is wrapped in an RGB ``fitz.Pixmap`` of the given
    dimensions, tagged with ``DEFAULT_DPI`` on both axes, and converted to a
    compressed, OCRed PDF with ``pdfocr_tobytes``.

    Args:
        pixmap_bytes: Raw pixel samples for the page (presumably three bytes
            per pixel, matching the RGB colorspace used here -- confirm
            against the caller).
        width: Page width in pixels.
        height: Page height in pixels.
        ocr_lang: Value forwarded as the ``language`` argument of the OCR call.
        tessdata_dir: Value forwarded as the ``tessdata`` argument of the OCR
            call.

    Returns:
        The bytes produced by ``pdfocr_tobytes`` for this page.

    Raises:
        RuntimeError: For any exception raised while building the pixmap or
            running OCR. Only the original exception's message is kept.
    """
    try:
        rgb = fitz.Colorspace(fitz.CS_RGB)
        # The trailing False is passed positionally, exactly as in the
        # original call (presumably PyMuPDF's "alpha" flag -- confirm).
        page_pixmap = fitz.Pixmap(rgb, width, height, pixmap_bytes, False)
        page_pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)

        ocr_options = {
            "compress": True,
            "language": ocr_lang,
            "tessdata": tessdata_dir,
        }
        return page_pixmap.pdfocr_tobytes(**ocr_options)
    except Exception as exc:
        # The original exception may hold unpicklable SWIG objects, which
        # would break multiprocessing when the error is sent back to the
        # parent. Raise a plain, picklable RuntimeError carrying just the
        # message, and drop the chained context with ``from None``.
        raise RuntimeError(str(exc)) from None
106+
107+
82108
class IsolationProvider(ABC):
83109
"""
84110
Abstracts an isolation provider
@@ -118,14 +144,6 @@ def convert(
118144
self.print_progress(document, True, str(e), 0)
119145
document.mark_as_failed()
120146

121-
def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes:
122-
"""Get a single page as pixels, OCR it, and return a PDF as bytes."""
123-
return pixmap.pdfocr_tobytes(
124-
compress=True,
125-
language=ocr_lang,
126-
tessdata=str(get_tessdata_dir()),
127-
)
128-
129147
def pixels_to_pdf_page(
130148
self,
131149
untrusted_data: bytes,
@@ -144,7 +162,11 @@ def pixels_to_pdf_page(
144162
pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)
145163

146164
if ocr_lang: # OCR the document
147-
page_pdf_bytes = self.ocr_page(pixmap, ocr_lang)
165+
page_pdf_bytes = pixmap.pdfocr_tobytes(
166+
compress=True,
167+
language=ocr_lang,
168+
tessdata=str(get_tessdata_dir()),
169+
)
148170
else: # Don't OCR
149171
page_doc = fitz.Document()
150172
page_doc.insert_file(pixmap)
@@ -175,35 +197,83 @@ def convert_with_proc(
175197

176198
safe_doc = fitz.Document()
177199

178-
for page in range(1, n_pages + 1):
179-
searchable = "searchable " if ocr_lang else ""
180-
text = (
181-
f"Converting page {page}/{n_pages} from pixels to {searchable}PDF"
182-
)
183-
self.print_progress(document, False, text, percentage)
184-
185-
width = read_int(p.stdout)
186-
height = read_int(p.stdout)
187-
if not (1 <= width <= errors.MAX_PAGE_WIDTH):
188-
raise errors.MaxPageWidthException()
189-
if not (1 <= height <= errors.MAX_PAGE_HEIGHT):
190-
raise errors.MaxPageHeightException()
191-
192-
num_pixels = width * height * 3 # three color channels
193-
untrusted_pixels = read_bytes(
194-
p.stdout,
195-
num_pixels,
196-
)
200+
# Use multiprocessing pool for OCR if needed
201+
if ocr_lang:
202+
max_workers = max(1, mp.cpu_count() - 1)
203+
ocr_pool = mp.Pool(processes=max_workers)
204+
async_results = {}
205+
# Pre-compute tessdata path to pass to workers (they can't access
206+
# sys.dangerzone_dev which is set only in the main process)
207+
tessdata_dir = str(get_tessdata_dir())
208+
else:
209+
ocr_pool = None
210+
tessdata_dir = None
197211

198-
page_pdf = self.pixels_to_pdf_page(
199-
untrusted_pixels,
200-
width,
201-
height,
202-
ocr_lang,
203-
)
204-
safe_doc.insert_pdf(page_pdf)
205-
206-
percentage += step
212+
try:
213+
for page in range(1, n_pages + 1):
214+
searchable = "searchable " if ocr_lang else ""
215+
text = f"Converting page {page}/{n_pages} from pixels to {searchable}PDF"
216+
self.print_progress(document, False, text, percentage)
217+
218+
width = read_int(p.stdout)
219+
height = read_int(p.stdout)
220+
if not (1 <= width <= errors.MAX_PAGE_WIDTH):
221+
raise errors.MaxPageWidthException()
222+
if not (1 <= height <= errors.MAX_PAGE_HEIGHT):
223+
raise errors.MaxPageHeightException()
224+
225+
num_pixels = width * height * 3 # three color channels
226+
untrusted_pixels = read_bytes(
227+
p.stdout,
228+
num_pixels,
229+
)
230+
231+
if ocr_lang:
232+
# Submit OCR task to worker pool
233+
assert ocr_pool is not None
234+
result = ocr_pool.apply_async(
235+
_ocr_page_worker,
236+
(untrusted_pixels, width, height, ocr_lang, tessdata_dir),
237+
)
238+
async_results[page] = result
239+
# Keep buffering pages, but collect results that are ready
240+
# to avoid queue overflow
241+
if len(async_results) > max_workers * 2:
242+
for ready_page in sorted(async_results.keys()):
243+
if ready_page in async_results:
244+
try:
245+
page_pdf_bytes = async_results[ready_page].get(
246+
timeout=0
247+
)
248+
page_doc = fitz.open("pdf", page_pdf_bytes)
249+
safe_doc.insert_pdf(page_doc)
250+
del async_results[ready_page]
251+
break
252+
except mp.TimeoutError:
253+
pass
254+
else:
255+
# No OCR: process immediately
256+
page_pdf = self.pixels_to_pdf_page(
257+
untrusted_pixels,
258+
width,
259+
height,
260+
None,
261+
)
262+
safe_doc.insert_pdf(page_pdf)
263+
264+
percentage += step
265+
266+
# Collect all remaining OCR results
267+
if ocr_lang:
268+
for page in sorted(async_results.keys()):
269+
page_pdf_bytes = async_results[page].get()
270+
page_doc = fitz.open("pdf", page_pdf_bytes)
271+
safe_doc.insert_pdf(page_doc)
272+
273+
finally:
274+
if ocr_pool is not None:
275+
ocr_pool.close()
276+
ocr_pool.join()
207277

208278
# Ensure nothing else is read after all bitmaps are obtained
209279
p.stdout.close()

0 commit comments

Comments
 (0)