55# Copyright (C) 2016 RaRe Technologies
66
77"""
8- Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump and extract sections of pages from it
9- and save to json-line format.
10-
11- If you have the `pattern` package installed, this module will use a fancy
12- lemmatization to get a lemma of each token (instead of plain alphabetic
13- tokenizer). The package is available at https://github.com/clips/pattern .
8+ Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump (typical filename
9+ is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2),
10+ extract titles, section names, section content and save to json-line format,
11+ that contains 3 fields ::
12+
13+ 'title' (str) - title of article,
14+ 'section_titles' (list) - list of titles of sections,
15+ 'section_texts' (list) - list of content from sections.
16+
17+ The English Wikipedia dump is available
18+ `here <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2>`_. Approximate
19+ processing time is 2.5 hours (i7-6700HQ, SSD).
20+
21+ Examples
22+ --------
23+
24+ Convert wiki to json-lines format:
25+ `python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 | gzip > enwiki-latest.json.gz`
26+
27+ Read json-lines dump
28+
29+ >>> # iterate over the plain text file we just created
30+ >>> for line in smart_open('enwiki-latest.json.gz'):
31+ >>> # decode JSON into a Python object
32+ >>> article = json.loads(line)
33+ >>>
34+ >>> # each article has a "title", "section_titles" and "section_texts" fields
35+ >>> print("Article title: %s" % article['title'])
36+ >>> for section_title, section_text in zip(article['section_titles'], article['section_texts']):
37+ >>> print("Section title: %s" % section_title)
38+ >>> print("Section text: %s" % section_text)
1439
1540"""
1641
2247import sys
2348from xml .etree import cElementTree
2449
25- from gensim .corpora .wikicorpus import ARTICLE_MIN_WORDS , IGNORED_NAMESPACES , WikiCorpus , \
26- filter_wiki , get_namespace , tokenize , utils
50+ from gensim .corpora .wikicorpus import IGNORED_NAMESPACES , WikiCorpus , filter_wiki , get_namespace , utils
2751from smart_open import smart_open
2852
2953
3054logger = logging .getLogger (__name__ )
3155
3256
def segment_all_articles(file_path, min_article_character=200):
    """Extract article titles and sections from a MediaWiki bz2 database dump.

    Parameters
    ----------
    file_path : str
        Path to MediaWiki dump, typical filename is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
        or <LANG>wiki-latest-pages-articles.xml.bz2.

    min_article_character : int, optional
        Minimum number of characters an article must contain; the value is forwarded
        to `_WikiSectionsCorpus`, which skips articles below this threshold.

    Yields
    ------
    (str, list of (str, str))
        Structure contains (title, [(section_heading, section_content), ...]).

    """
    with smart_open(file_path, 'rb') as dump_stream:
        corpus = _WikiSectionsCorpus(dump_stream, min_article_character=min_article_character)
        corpus.metadata = True
        # Stream articles one at a time so the dump never has to be held in memory.
        for title, sections in corpus.get_texts_with_sections():
            yield title, sections
5481
5582
def segment_and_write_all_articles(file_path, output_file, min_article_character=200):
    """Write article titles and sections to `output_file` (or stdout) in json-lines format.

    Every output line is a JSON object with 3 fields::

        'title' - title of article,
        'section_titles' - list of titles of sections,
        'section_texts' - list of content from sections.

    Parameters
    ----------
    file_path : str
        Path to MediaWiki dump, typical filename is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
        or <LANG>wiki-latest-pages-articles.xml.bz2.

    output_file : str or None
        Path to output file in json-lines format. If None, the output is written to stdout.

    min_article_character : int, optional
        Minimum number of characters an article must contain; passed through to
        `segment_all_articles`.

    """
    if output_file is None:
        # Bytes are written below. On Python 3 the text-mode stdout exposes its binary
        # layer as `buffer`; where that attribute is absent, stdout itself is used.
        outfile = getattr(sys.stdout, 'buffer', sys.stdout)
    else:
        outfile = smart_open(output_file, 'wb')

    try:
        articles = segment_all_articles(file_path, min_article_character)
        for idx, (article_title, article_sections) in enumerate(articles):
            output_data = {"title": article_title, "section_titles": [], "section_texts": []}
            for section_heading, section_content in article_sections:
                output_data["section_titles"].append(section_heading)
                output_data["section_texts"].append(section_content)
            if (idx + 1) % 100000 == 0:
                logger.info("Processed #%d articles", idx + 1)
            # The sink is binary ('wb' / stdout buffer), so encode the JSON line explicitly.
            outfile.write((json.dumps(output_data) + "\n").encode('utf-8'))
    finally:
        if output_file is None:
            # stdout belongs to the interpreter: flush it, but never close it.
            outfile.flush()
        else:
            outfile.close()
83120
84121
85122def extract_page_xmls (f ):
@@ -160,25 +197,28 @@ def segment(page_xml):
160197 return title , sections
161198
162199
163- class WikiSectionsCorpus (WikiCorpus ):
200+ class _WikiSectionsCorpus (WikiCorpus ):
164201 """Treat a wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
165202 or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus.
166203
167204 The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk.
168205
169206 """
170- def __init__ (self , fileobj , processes = None , lemmatize = utils .has_pattern (), filter_namespaces = ('0' ,)):
207+ def __init__ (self , fileobj , min_article_character = 200 , processes = None ,
208+ lemmatize = utils .has_pattern (), filter_namespaces = ('0' ,)):
171209 """
172210 Parameters
173211 ----------
174212 fileobj : file
175213 File descriptor of MediaWiki dump.
176- processes : int
214+ min_article_character : int, optional
215+ Minimal number of character for article (except titles and leading gaps).
216+ processes : int, optional
177217 Number of processes, max(1, multiprocessing.cpu_count() - 1) if None.
178- lemmatize : bool
218+ lemmatize : bool, optional
179219 If `pattern` package is installed, use fancier shallow parsing to get token lemmas.
180220 Otherwise, use simple regexp tokenization.
181- filter_namespaces : tuple of int
221+ filter_namespaces : tuple of int, optional
182222 Enumeration of namespaces that will be ignored.
183223
184224 """
@@ -189,6 +229,7 @@ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filte
189229 processes = max (1 , multiprocessing .cpu_count () - 1 )
190230 self .processes = processes
191231 self .lemmatize = lemmatize
232+ self .min_article_character = min_article_character
192233
193234 def get_texts_with_sections (self ):
194235 """Iterate over the dump, returning titles and text versions of all sections of articles.
@@ -217,16 +258,15 @@ def get_texts_with_sections(self):
217258 # is dumb and would load the entire input into RAM at once...
218259 for group in utils .chunkize (page_xmls , chunksize = 10 * self .processes , maxsize = 1 ):
219260 for article_title , sections in pool .imap (segment , group ): # chunksize=10):
220- # article redirects and short stubs are pruned here
221- num_total_tokens = 0
222- for section_title , section_content in sections :
223- if self .lemmatize :
224- num_total_tokens += len (utils .lemmatize (section_content ))
225- else :
226- num_total_tokens += len (tokenize (section_content ))
227- if num_total_tokens < ARTICLE_MIN_WORDS or \
228- any (article_title .startswith (ignore + ':' ) for ignore in IGNORED_NAMESPACES ):
261+ # article redirects are pruned here
262+ if any (article_title .startswith (ignore + ':' ) for ignore in IGNORED_NAMESPACES ): # filter non-articles
263+ continue
264+ if not sections or sections [0 ][1 ].lstrip ().lower ().startswith ("#redirect" ): # filter redirect
229265 continue
266+ if sum (len (body .strip ()) for (_ , body ) in sections ) < self .min_article_character :
267+ # filter very short articles (trash)
268+ continue
269+
230270 articles += 1
231271 yield (article_title , sections )
232272 pool .terminate ()
@@ -239,8 +279,15 @@ def get_texts_with_sections(self):
239279
# Command-line interface: parse the arguments, then stream the segmented dump to the chosen output.
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__'])
parser.add_argument('-f', '--file', help='Path to MediaWiki database dump', required=True)
parser.add_argument('-o', '--output', help='Path to output file (stdout if not specified)')
parser.add_argument(
    '-m', '--min-article-character',
    help="Minimal number of character for article (except titles and leading gaps), "
         "if article contains less characters that this value, "
         "article will be filtered (will not be in the output file), default: %(default)s",
    # argparse passes command-line values through as strings unless a type is given;
    # the threshold is later compared with an integer character count, so convert it here.
    type=int,
    default=200
)
args = parser.parse_args()
segment_and_write_all_articles(args.file, args.output, args.min_article_character)

logger.info("finished running %s", sys.argv[0])
0 commit comments