Skip to content

Commit 64f9a92

Browse files
authored
Fix segment-wiki script (#1694)
* Fix script docstring (format description), remove pruning through tokenization, more descriptive field names, stdout support (as default option) * Add link to fresh en wiki * Add link to fresh en wiki, examples section, filter redirect * strip -> lstrip * Added time for processing & ignore empty articles * extend filtering section * reduce threshold * parametrize minimal article length, reduce to 200
1 parent c583b28 commit 64f9a92

1 file changed

Lines changed: 83 additions & 36 deletions

File tree

gensim/scripts/segment_wiki.py

Lines changed: 83 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,37 @@
55
# Copyright (C) 2016 RaRe Technologies
66

77
"""
8-
Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump and extract sections of pages from it
9-
and save to json-line format.
10-
11-
If you have the `pattern` package installed, this module will use a fancy
12-
lemmatization to get a lemma of each token (instead of plain alphabetic
13-
tokenizer). The package is available at https://github.com/clips/pattern .
8+
Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump (typical filename
9+
is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2),
10+
extract titles, section names, section content and save to json-line format,
11+
that contains 3 fields ::
12+
13+
'title' (str) - title of article,
14+
'section_titles' (list) - list of titles of sections,
15+
'section_texts' (list) - list of content from sections.
16+
17+
English Wikipedia dump available
18+
`here <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2>`_. Approximate time
19+
for processing is 2.5 hours (i7-6700HQ, SSD).
20+
21+
Examples
22+
--------
23+
24+
Convert wiki to json-lines format:
25+
`python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 | gzip > enwiki-latest.json.gz`
26+
27+
Read json-lines dump
28+
29+
>>> # iterate over the plain text file we just created
30+
>>> for line in smart_open('enwiki-latest.json.gz'):
31+
>>> # decode JSON into a Python object
32+
>>> article = json.loads(line)
33+
>>>
34+
>>> # each article has a "title", "section_titles" and "section_texts" fields
35+
>>> print("Article title: %s" % article['title'])
36+
>>> for section_title, section_text in zip(article['section_titles'], article['section_texts']):
37+
>>> print("Section title: %s" % section_title)
38+
>>> print("Section text: %s" % section_text)
1439
1540
"""
1641

@@ -22,15 +47,14 @@
2247
import sys
2348
from xml.etree import cElementTree
2449

25-
from gensim.corpora.wikicorpus import ARTICLE_MIN_WORDS, IGNORED_NAMESPACES, WikiCorpus, \
26-
filter_wiki, get_namespace, tokenize, utils
50+
from gensim.corpora.wikicorpus import IGNORED_NAMESPACES, WikiCorpus, filter_wiki, get_namespace, utils
2751
from smart_open import smart_open
2852

2953

3054
logger = logging.getLogger(__name__)
3155

3256

33-
def segment_all_articles(file_path):
57+
def segment_all_articles(file_path, min_article_character=200):
3458
"""Extract article titles and sections from a MediaWiki bz2 database dump.
3559
3660
Parameters
@@ -39,27 +63,30 @@ def segment_all_articles(file_path):
3963
Path to MediaWiki dump, typical filename is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
4064
or <LANG>wiki-latest-pages-articles.xml.bz2.
4165
66+
min_article_character : int, optional
67+
Minimal number of characters for an article (excluding titles and leading gaps).
68+
4269
Yields
4370
------
4471
(str, list of (str, str))
4572
Structure contains (title, [(section_heading, section_content), ...]).
4673
4774
"""
4875
with smart_open(file_path, 'rb') as xml_fileobj:
49-
wiki_sections_corpus = WikiSectionsCorpus(xml_fileobj)
76+
wiki_sections_corpus = _WikiSectionsCorpus(xml_fileobj, min_article_character=min_article_character)
5077
wiki_sections_corpus.metadata = True
5178
wiki_sections_text = wiki_sections_corpus.get_texts_with_sections()
5279
for article_title, article_sections in wiki_sections_text:
5380
yield article_title, article_sections
5481

5582

56-
def segment_and_print_all_articles(file_path, output_file):
83+
def segment_and_write_all_articles(file_path, output_file, min_article_character=200):
5784
"""Write article title and sections to output_file,
5885
output_file is json-line file with 3 fields::
5986
60-
'tl' - title of article,
61-
'st' - list of titles of sections,
62-
'sc' - list of content from sections.
87+
'title' - title of article,
88+
'section_titles' - list of titles of sections,
89+
'section_texts' - list of content from sections.
6390
6491
Parameters
6592
----------
@@ -68,18 +95,28 @@ def segment_and_print_all_articles(file_path, output_file):
6895
or <LANG>wiki-latest-pages-articles.xml.bz2.
6996
7097
output_file : str
71-
Path to output file.
98+
Path to output file in json-lines format.
99+
100+
min_article_character : int, optional
101+
Minimal number of characters for an article (excluding titles and leading gaps).
72102
73103
"""
74-
with smart_open(output_file, 'w') as outfile:
75-
for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)):
76-
output_data = {"tl": article_title, "st": [], "sc": []}
104+
if output_file is None:
105+
outfile = sys.stdout
106+
else:
107+
outfile = smart_open(output_file, 'wb')
108+
109+
try:
110+
for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path, min_article_character)):
111+
output_data = {"title": article_title, "section_titles": [], "section_texts": []}
77112
for section_heading, section_content in article_sections:
78-
output_data["st"].append(section_heading)
79-
output_data["sc"].append(section_content)
113+
output_data["section_titles"].append(section_heading)
114+
output_data["section_texts"].append(section_content)
80115
if (idx + 1) % 100000 == 0:
81116
logger.info("Processed #%d articles", idx + 1)
82117
outfile.write(json.dumps(output_data) + "\n")
118+
finally:
119+
outfile.close()
83120

84121

85122
def extract_page_xmls(f):
@@ -160,25 +197,28 @@ def segment(page_xml):
160197
return title, sections
161198

162199

163-
class WikiSectionsCorpus(WikiCorpus):
200+
class _WikiSectionsCorpus(WikiCorpus):
164201
"""Treat a wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
165202
or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus.
166203
167204
The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk.
168205
169206
"""
170-
def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0',)):
207+
def __init__(self, fileobj, min_article_character=200, processes=None,
208+
lemmatize=utils.has_pattern(), filter_namespaces=('0',)):
171209
"""
172210
Parameters
173211
----------
174212
fileobj : file
175213
File descriptor of MediaWiki dump.
176-
processes : int
214+
min_article_character : int, optional
215+
Minimal number of characters for an article (excluding titles and leading gaps).
216+
processes : int, optional
177217
Number of processes, max(1, multiprocessing.cpu_count() - 1) if None.
178-
lemmatize : bool
218+
lemmatize : bool, optional
179219
If `pattern` package is installed, use fancier shallow parsing to get token lemmas.
180220
Otherwise, use simple regexp tokenization.
181-
filter_namespaces : tuple of int
221+
filter_namespaces : tuple of int, optional
182222
Enumeration of namespaces that will be ignored.
183223
184224
"""
@@ -189,6 +229,7 @@ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filte
189229
processes = max(1, multiprocessing.cpu_count() - 1)
190230
self.processes = processes
191231
self.lemmatize = lemmatize
232+
self.min_article_character = min_article_character
192233

193234
def get_texts_with_sections(self):
194235
"""Iterate over the dump, returning titles and text versions of all sections of articles.
@@ -217,16 +258,15 @@ def get_texts_with_sections(self):
217258
# is dumb and would load the entire input into RAM at once...
218259
for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
219260
for article_title, sections in pool.imap(segment, group): # chunksize=10):
220-
# article redirects and short stubs are pruned here
221-
num_total_tokens = 0
222-
for section_title, section_content in sections:
223-
if self.lemmatize:
224-
num_total_tokens += len(utils.lemmatize(section_content))
225-
else:
226-
num_total_tokens += len(tokenize(section_content))
227-
if num_total_tokens < ARTICLE_MIN_WORDS or \
228-
any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
261+
# article redirects are pruned here
262+
if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): # filter non-articles
263+
continue
264+
if not sections or sections[0][1].lstrip().lower().startswith("#redirect"): # filter redirect
229265
continue
266+
if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
267+
# filter very short articles (trash)
268+
continue
269+
230270
articles += 1
231271
yield (article_title, sections)
232272
pool.terminate()
@@ -239,8 +279,15 @@ def get_texts_with_sections(self):
239279

240280
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__'])
241281
parser.add_argument('-f', '--file', help='Path to MediaWiki database dump', required=True)
242-
parser.add_argument('-o', '--output', help='Path to output file', required=True)
282+
parser.add_argument('-o', '--output', help='Path to output file (stdout if not specified)')
283+
parser.add_argument(
284+
'-m', '--min-article-character',
285+
help="Minimal number of character for article (except titles and leading gaps), "
286+
"if article contains less characters that this value, "
287+
"article will be filtered (will not be in the output file), default: %(default)s",
288+
default=200
289+
)
243290
args = parser.parse_args()
244-
segment_and_print_all_articles(args.file, args.output)
291+
segment_and_write_all_articles(args.file, args.output, args.min_article_character)
245292

246293
logger.info("finished running %s", sys.argv[0])

0 commit comments

Comments
 (0)