99import multiprocessing
1010import os
1111import re
12+ import shutil
1213import signal
1314import timeit
1415import unicodedata
1516from urllib .parse import urlparse , urlunparse
1617
18+ import cairosvg
19+ import docx
20+ import docx2txt
1721import easyocr
1822import fitz
1923import numpy as np
2024import pandas as pd
25+ import pptx
2126import requests
2227import yaml
2328from bs4 import BeautifulSoup
2732 UnstructuredHTMLLoader ,
2833 UnstructuredImageLoader ,
2934 UnstructuredMarkdownLoader ,
30- UnstructuredPowerPointLoader ,
3135 UnstructuredXMLLoader ,
3236)
3337from langchain_community .llms import HuggingFaceEndpoint
@@ -131,32 +135,81 @@ def load_txt(txt_path):
131135
def load_doc(doc_path):
    """Load a legacy ``.doc`` file and return its text.

    The file is converted to ``.docx`` with LibreOffice (written next to the
    source file as ``<doc_path>x``), parsed with ``load_docx`` and the
    intermediate ``.docx`` is removed afterwards.

    Args:
        doc_path: Path to the ``.doc`` file.

    Returns:
        The text produced by ``load_docx`` for the converted document.

    Raises:
        FileNotFoundError: If the ``libreoffice`` executable is not on PATH.
    """
    # Local import: keeps this block self-contained without touching the
    # module-level import section.
    import subprocess

    print("Converting doc file to docx file...")
    docx_path = doc_path + "x"
    # dirname() is "" for a bare filename; an empty --outdir argument would make
    # LibreOffice treat the input path as the output directory.
    out_dir = os.path.dirname(docx_path) or "."
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through verbatim and cannot inject commands.
    subprocess.run(
        [
            "libreoffice",
            "--headless",
            "--invisible",
            "--convert-to",
            "docx",
            "--outdir",
            out_dir,
            doc_path,
        ],
        check=False,
    )
    print("Converted doc file to docx file.")
    try:
        text = load_docx(docx_path)
    finally:
        # Remove the intermediate file even when parsing fails; guard the
        # removal so a failed conversion does not mask the parser's error.
        if os.path.exists(docx_path):
            os.remove(docx_path)
    return text
145145
146146
def load_docx(docx_path):
    """Load a ``.docx`` file and return its text, including OCR'd image text.

    Paragraph text is emitted in document order, one paragraph per line. When a
    paragraph embeds an image, the image is passed to ``load_image`` and any
    text it returns is appended right after that paragraph.

    Args:
        docx_path: Path to the ``.docx`` file.

    Returns:
        The concatenated text, each piece terminated by a newline.
    """
    # Local import: keeps this block self-contained without touching the
    # module-level import section.
    import tempfile

    doc = docx.Document(docx_path)

    # Map relationship id -> image file name for every embedded image part.
    rid2img = {}
    for rel in doc.part.rels.values():
        if isinstance(rel._target, docx.parts.image.ImagePart):
            rid2img[rel.rId] = os.path.basename(rel._target.partname)

    # Extract images into a private temp directory rather than a fixed
    # "./imgs/": a shared directory collides between concurrent loaders, and
    # removing it afterwards could delete an unrelated directory of that name.
    save_path = tempfile.mkdtemp(prefix="docx_imgs_") if rid2img else None
    pieces = []
    try:
        if save_path:
            docx2txt.process(docx_path, save_path)
        for paragraph in doc.paragraphs:
            pieces.append(paragraph.text)
            if not rid2img:
                continue
            # Serialise the paragraph once instead of once per relationship id.
            xml = paragraph._p.xml
            if "graphicData" not in xml:
                continue
            for rid, img_name in rid2img.items():
                # Match the quoted attribute value so that "rId1" does not
                # also match "rId10", "rId11", ...
                if f'"{rid}"' in xml:
                    img_text = load_image(os.path.join(save_path, img_name))
                    if img_text:
                        pieces.append(img_text)
    finally:
        if save_path:
            shutil.rmtree(save_path, ignore_errors=True)
    return "".join(piece + "\n" for piece in pieces)
173+
174+
def load_ppt(ppt_path):
    """Load a legacy ``.ppt`` file and return its text.

    The file is converted to ``.pptx`` with LibreOffice (written next to the
    source file as ``<ppt_path>x``), parsed with ``load_pptx`` and the
    intermediate ``.pptx`` is removed afterwards.

    Args:
        ppt_path: Path to the ``.ppt`` file.

    Returns:
        The text produced by ``load_pptx`` for the converted presentation.

    Raises:
        FileNotFoundError: If the ``libreoffice`` executable is not on PATH.
    """
    # Local import: keeps this block self-contained without touching the
    # module-level import section.
    import subprocess

    print("Converting ppt file to pptx file...")
    pptx_path = ppt_path + "x"
    # dirname() is "" for a bare filename; an empty --outdir argument would make
    # LibreOffice treat the input path as the output directory.
    out_dir = os.path.dirname(pptx_path) or "."
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through verbatim and cannot inject commands.
    subprocess.run(
        [
            "libreoffice",
            "--headless",
            "--invisible",
            "--convert-to",
            "pptx",
            "--outdir",
            out_dir,
            ppt_path,
        ],
        check=False,
    )
    print("Converted ppt file to pptx file.")
    try:
        text = load_pptx(pptx_path)
    finally:
        # Remove the intermediate file even when parsing fails; guard the
        # removal so a failed conversion does not mask the parser's error.
        if os.path.exists(pptx_path):
            os.remove(pptx_path)
    return text
154184
155185
def load_pptx(pptx_path):
    """Load a ``.pptx`` file and return its text.

    For every slide, shapes are visited top-to-bottom then left-to-right. Text
    frames contribute their text, tables contribute one tab-separated line per
    row, and pictures are passed to ``load_image`` with any returned text
    appended.

    Args:
        pptx_path: Path to the ``.pptx`` file.

    Returns:
        The concatenated text, each piece terminated by a newline.
    """
    # Local import: keeps this block self-contained without touching the
    # module-level import section.
    import tempfile

    def _position(shape):
        # top/left are None for placeholders that inherit their position from
        # the layout; treat those as 0 so the sort never compares None to int.
        return (shape.top or 0, shape.left or 0)

    pieces = []
    prs = pptx.Presentation(pptx_path)
    # Pictures are written to a private temp directory instead of the current
    # working directory, so an existing file with the same name is never
    # overwritten/deleted and concurrent loaders do not collide. The directory
    # is removed on exit, even if load_image raises.
    with tempfile.TemporaryDirectory(prefix="pptx_imgs_") as tmp_dir:
        for slide in prs.slides:
            for shape in sorted(slide.shapes, key=_position):
                if shape.has_text_frame and shape.text:
                    pieces.append(shape.text)
                if shape.has_table:
                    table_contents = "\n".join(
                        "\t".join(cell.text for cell in row.cells)
                        for row in shape.table.rows
                    )
                    if table_contents:
                        pieces.append(table_contents)
                if hasattr(shape, "image") and hasattr(shape.image, "blob"):
                    image = shape.image
                    # basename() guards against a filename carrying directory
                    # components; fall back to a generic name if it is unset.
                    img_name = os.path.basename(image.filename or "") or "image"
                    img_path = os.path.join(tmp_dir, img_name)
                    with open(img_path, "wb") as f:
                        f.write(image.blob)
                    img_text = load_image(img_path)
                    if img_text:
                        pieces.append(img_text)
    return "".join(piece + "\n" for piece in pieces)
161214
162215
@@ -214,13 +267,11 @@ def load_image(image_path):
214267 return response .json ()["text" ].strip ()
215268 loader = UnstructuredImageLoader (image_path )
216269 text = loader .load ()[0 ].page_content
217- return text
270+ return text . strip ()
218271
219272
220273def load_svg (svg_path ):
221274 """Load the svg file."""
222- import cairosvg
223-
224275 png_path = svg_path .replace (".svg" , ".png" )
225276 cairosvg .svg2png (url = svg_path , write_to = png_path )
226277 text = load_image (png_path )
@@ -239,7 +290,9 @@ def document_loader(doc_path):
239290 return load_doc (doc_path )
240291 elif doc_path .endswith (".docx" ):
241292 return load_docx (doc_path )
242- elif doc_path .endswith (".pptx" ) or doc_path .endswith (".ppt" ):
293+ elif doc_path .endswith (".ppt" ):
294+ return load_ppt (doc_path )
295+ elif doc_path .endswith (".pptx" ):
243296 return load_pptx (doc_path )
244297 elif doc_path .endswith (".md" ):
245298 return load_md (doc_path )
@@ -261,7 +314,7 @@ def document_loader(doc_path):
261314 ):
262315 return load_image (doc_path )
263316 elif doc_path .endswith (".svg" ):
264- return load_image (doc_path )
317+ return load_svg (doc_path )
265318 else :
266319 raise NotImplementedError (
267320 "Current only support pdf, html, txt, doc, docx, pptx, ppt, md, xml"
0 commit comments