11import contextlib
22import logging
3+ import multiprocessing as mp
34import os
45import platform
6+ import queue
57import signal
68import subprocess
79import sys
@@ -79,6 +81,30 @@ def sanitize_debug_text(text: bytes) -> str:
7981 return replace_control_chars (untrusted_text , keep_newlines = True )
8082
8183
84+ def _ocr_page_worker (
85+ pixmap_bytes : bytes , width : int , height : int , ocr_lang : str , tessdata_dir : str
86+ ) -> bytes :
87+ """Worker function for multiprocessing OCR. Returns PDF bytes."""
88+ try :
89+ pixmap = fitz .Pixmap (
90+ fitz .Colorspace (fitz .CS_RGB ),
91+ width ,
92+ height ,
93+ pixmap_bytes ,
94+ False ,
95+ )
96+ pixmap .set_dpi (DEFAULT_DPI , DEFAULT_DPI )
97+ return pixmap .pdfocr_tobytes (
98+ compress = True ,
99+ language = ocr_lang ,
100+ tessdata = tessdata_dir ,
101+ )
102+ except Exception as e :
103+ # Re-raise with a picklable exception to avoid multiprocessing errors
104+ # when the original exception contains unpicklable SWIG objects
105+ raise RuntimeError (str (e )) from None
106+
107+
82108class IsolationProvider (ABC ):
83109 """
84110 Abstracts an isolation provider
@@ -118,14 +144,6 @@ def convert(
118144 self .print_progress (document , True , str (e ), 0 )
119145 document .mark_as_failed ()
120146
121- def ocr_page (self , pixmap : fitz .Pixmap , ocr_lang : str ) -> bytes :
122- """Get a single page as pixels, OCR it, and return a PDF as bytes."""
123- return pixmap .pdfocr_tobytes (
124- compress = True ,
125- language = ocr_lang ,
126- tessdata = str (get_tessdata_dir ()),
127- )
128-
129147 def pixels_to_pdf_page (
130148 self ,
131149 untrusted_data : bytes ,
@@ -144,7 +162,11 @@ def pixels_to_pdf_page(
144162 pixmap .set_dpi (DEFAULT_DPI , DEFAULT_DPI )
145163
146164 if ocr_lang : # OCR the document
147- page_pdf_bytes = self .ocr_page (pixmap , ocr_lang )
165+ page_pdf_bytes = pixmap .pdfocr_tobytes (
166+ compress = True ,
167+ language = ocr_lang ,
168+ tessdata = str (get_tessdata_dir ()),
169+ )
148170 else : # Don't OCR
149171 page_doc = fitz .Document ()
150172 page_doc .insert_file (pixmap )
@@ -175,35 +197,83 @@ def convert_with_proc(
175197
176198 safe_doc = fitz .Document ()
177199
178- for page in range (1 , n_pages + 1 ):
179- searchable = "searchable " if ocr_lang else ""
180- text = (
181- f"Converting page { page } /{ n_pages } from pixels to { searchable } PDF"
182- )
183- self .print_progress (document , False , text , percentage )
184-
185- width = read_int (p .stdout )
186- height = read_int (p .stdout )
187- if not (1 <= width <= errors .MAX_PAGE_WIDTH ):
188- raise errors .MaxPageWidthException ()
189- if not (1 <= height <= errors .MAX_PAGE_HEIGHT ):
190- raise errors .MaxPageHeightException ()
191-
192- num_pixels = width * height * 3 # three color channels
193- untrusted_pixels = read_bytes (
194- p .stdout ,
195- num_pixels ,
196- )
200+ # Use multiprocessing pool for OCR if needed
201+ if ocr_lang :
202+ max_workers = max (1 , mp .cpu_count () - 1 )
203+ ocr_pool = mp .Pool (processes = max_workers )
204+ async_results = {}
205+ # Pre-compute tessdata path to pass to workers (they can't access
206+ # sys.dangerzone_dev which is set only in the main process)
207+ tessdata_dir = str (get_tessdata_dir ())
208+ else :
209+ ocr_pool = None
210+ tessdata_dir = None
197211
198- page_pdf = self .pixels_to_pdf_page (
199- untrusted_pixels ,
200- width ,
201- height ,
202- ocr_lang ,
203- )
204- safe_doc .insert_pdf (page_pdf )
205-
206- percentage += step
212+ try :
213+ for page in range (1 , n_pages + 1 ):
214+ searchable = "searchable " if ocr_lang else ""
215+ text = f"Converting page { page } /{ n_pages } from pixels to { searchable } PDF"
216+ self .print_progress (document , False , text , percentage )
217+
218+ width = read_int (p .stdout )
219+ height = read_int (p .stdout )
220+ if not (1 <= width <= errors .MAX_PAGE_WIDTH ):
221+ raise errors .MaxPageWidthException ()
222+ if not (1 <= height <= errors .MAX_PAGE_HEIGHT ):
223+ raise errors .MaxPageHeightException ()
224+
225+ num_pixels = width * height * 3 # three color channels
226+ untrusted_pixels = read_bytes (
227+ p .stdout ,
228+ num_pixels ,
229+ )
230+
231+ if ocr_lang :
232+ # Submit OCR task to worker pool
233+ assert ocr_pool is not None
234+ result = ocr_pool .apply_async (
235+ _ocr_page_worker ,
236+ (untrusted_pixels , width , height , ocr_lang , tessdata_dir ),
237+ )
238+ async_results [page ] = result
239+ # Keep buffering pages, but collect results that are ready
240+ # to avoid queue overflow
241+ if len (async_results ) > max_workers * 2 :
242+ for ready_page in sorted (async_results .keys ()):
243+ if ready_page in async_results :
244+ try :
245+ page_pdf_bytes = async_results [ready_page ].get (
246+ timeout = 0
247+ )
248+ page_doc = fitz .open ("pdf" , page_pdf_bytes )
249+ safe_doc .insert_pdf (page_doc )
250+ del async_results [ready_page ]
251+ break
252+ except mp .TimeoutError :
253+ pass
254+ else :
255+ # No OCR: process immediately
256+ page_pdf = self .pixels_to_pdf_page (
257+ untrusted_pixels ,
258+ width ,
259+ height ,
260+ None ,
261+ )
262+ safe_doc .insert_pdf (page_pdf )
263+
264+ percentage += step
265+
266+ # Collect all remaining OCR results
267+ if ocr_lang :
268+ for page in sorted (async_results .keys ()):
269+ page_pdf_bytes = async_results [page ].get ()
270+ page_doc = fitz .open ("pdf" , page_pdf_bytes )
271+ safe_doc .insert_pdf (page_doc )
272+
273+ finally :
274+ if ocr_pool is not None :
275+ ocr_pool .close ()
276+ ocr_pool .join ()
207277
208278 # Ensure nothing else is read after all bitmaps are obtained
209279 p .stdout .close ()
0 commit comments