[WIP] Data/model storage. Fix #1453 (PR #1632)
Conversation
|
|
||
|
|
||
| def _create_base_dir(): | ||
| r"""Create the gensim-data directory in home directory, if it has not been already created. |
There was a problem hiding this comment.
Is it really necessary to add the `r` prefix to all docstrings? What's the reason?
| sys.stdout.flush() | ||
|
|
||
|
|
||
| def _create_base_dir(): |
There was a problem hiding this comment.
Maybe using `__` instead of `_` would be better (to hide these names from import), here and everywhere?
|
|
||
| if __name__ == '__main__': | ||
| logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s :%(message)s', stream=sys.stdout, level=logging.INFO) | ||
| parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d data__name | -i data__name | -c]") |
There was a problem hiding this comment.
No need to pass custom "usage" string here (argparse will generate it automatically)
| logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s :%(message)s', stream=sys.stdout, level=logging.INFO) | ||
| parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d data__name | -i data__name | -c]") | ||
| group = parser.add_mutually_exclusive_group() | ||
| group.add_argument("-d", "--download", metavar="data__name", nargs=1, help="To download a corpus/model : python -m gensim.downloader -d corpus/model name") |
There was a problem hiding this comment.
Strange names for `metavar`; why is `metavar` needed here?
| logger.info("%s downloaded", name) | ||
| else: | ||
| rmtree(tmp_dir) | ||
| raise Exception("There was a problem in downloading the data. We recommend you to re-try.") |
There was a problem hiding this comment.
Add info about checksums (concrete filename, expected checksum, real checksum, expected size, real size).
|
Great job @chaitaliSaini, your code is now more readable and clear (and works reliably) 🔥 👍 @anotherbugmaster will review your docstrings today. |
There was a problem hiding this comment.
Good job, thank you! Fix the minor issues and check out this style guide (in case you haven't yet); it will help you write consistent documentation:
https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt#docstring-standard
|
|
||
|
|
||
| def progress(chunks_downloaded, chunk_size, total_size): | ||
| r"""Create and update the progress bar. |
| filled_len = int(math.floor((bar_len * size_downloaded) / total_size)) | ||
| percent_downloaded = round((size_downloaded * 100) / total_size, 1) | ||
| bar = '=' * filled_len + '-' * (bar_len - filled_len) | ||
| sys.stdout.write('[%s] %s%s %s/%sMB downloaded\r' % (bar, percent_downloaded, "%", round(size_downloaded / (1024 * 1024), 1), round(float(total_size) / (1024 * 1024), 1))) |
|
|
||
|
|
||
| def _calculate_md5_checksum(tar_file): | ||
| r"""Calculate the checksum of the given tar.gz file. |
| def info(name=None): | ||
| r"""Return the information related to model/dataset. | ||
|
|
||
| If name is supplied, then information related to the given dataset/model will be returned. Otherwise detailed information of all model/datasets will be returned. |
| Returns | ||
| ------- | ||
| dict | ||
| Return detailed information about all models/datasets if name is not provided. Otherwise return detailed informtiona of the specific model/dataset |
| data: | ||
| load model to memory | ||
| data_dir: str | ||
| return path of dataset/model. |
There was a problem hiding this comment.
No new line after last section
|
|
||
| Parameters | ||
| ---------- | ||
| name : {None, data name}, optional |
There was a problem hiding this comment.
name : str or None, optional is the right way. Also try to write a description after every parameter.
| Parameters | ||
| ---------- | ||
| name: str | ||
| dataset/model name |
| Parameters | ||
| ---------- | ||
| name: str | ||
| dataset/model name which has to be downloaded |
| import numpy as np | ||
|
|
||
|
|
||
| class TestApi(unittest.TestCase): |
There was a problem hiding this comment.
Need to add test for multipart
| import math | ||
| import shutil | ||
| import tempfile | ||
| try: |
There was a problem hiding this comment.
One try/catch is enough here.
| Parameters | ||
| ---------- | ||
| chunks_downloaded : int | ||
| Number of chunks of data that have been downloaded |
There was a problem hiding this comment.
Add a `.` at the end of the sentence (here and everywhere).
|
|
||
| def _create_base_dir(): | ||
| """Create the gensim-data directory in home directory, if it has not been already created. | ||
| Raises |
There was a problem hiding this comment.
missing newline before section title
| """Create the gensim-data directory in home directory, if it has not been already created. | ||
| Raises | ||
| ------ | ||
| File Exists Error |
There was a problem hiding this comment.
Raises
------
Exception
Two possible reasons: ...
| return data['models'][name]["checksum"] | ||
| else: | ||
| if name in corpora: | ||
| return data['corpora'][name]["checksum-" + str(part)] |
There was a problem hiding this comment.
Use `"checksum-{}".format(part)` instead.
| tmp_dir = tempfile.mkdtemp() | ||
| tmp_load_file_path = os.path.join(tmp_dir, "__init__.py") | ||
| urllib.urlretrieve(url_load_file, tmp_load_file_path) | ||
| no_parts = int(_get_parts(name)) |
There was a problem hiding this comment.
store it as int, don't cast
| compressed_folder_name = "{f}.tar.gz_a{p}".format(f=name, p=chr(96 + part)) | ||
| tmp_data_file_dir = os.path.join(tmp_dir, compressed_folder_name) | ||
| logger.info("Downloading Part %s/%s", part, no_parts) | ||
| urllib.urlretrieve(url_data, tmp_data_file_dir, reporthook=_progress) |
There was a problem hiding this comment.
Show part on progressbar
| concatenated_folder_dir = os.path.join(tmp_dir, concatenated_folder_name) | ||
| for part in range(1, no_parts + 1): | ||
| url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz_a{p}".format(f=name, p=chr(96 + part)) | ||
| compressed_folder_name = "{f}.tar.gz_a{p}".format(f=name, p=chr(96 + part)) |
| os.remove(concatenated_folder_dir) | ||
| os.rename(tmp_dir, data_folder_dir) | ||
| else: | ||
| url_data = "https://github.com/chaitaliSaini/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=name) |
| logger.info("%s \n", json.dumps(data['corpora'][name], indent=4)) | ||
| return data['corpora'][name] | ||
| elif name in models: | ||
| logger.info("%s \n", json.dumps(data['corpora'][name], indent=4)) |
There was a problem hiding this comment.
Bug: `data['corpora'][name]` should be `data['models'][name]`.
|
Finished in #1705 |
API for dataset/model storage (old PR #1492).