-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpyproject.toml
More file actions
72 lines (67 loc) · 3.01 KB
/
pyproject.toml
File metadata and controls
72 lines (67 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
[project]
name = "raincloud"
version = "0.1.1"
description = "Client-reproducible pipeline for building a curated catalog of public datasets as Parquet + Vortex files."
readme = "README.md"
requires-python = ">=3.11"
license = "Apache-2.0"
license-files = ["LICENSE"]
authors = [
    { name = "Raincloud Maintainers", email = "raincloud@spiraldb.com" },
]
dependencies = [
    # Arrow / Parquet core, shared by every pipeline stage.
    "pyarrow>=23.0",
    "numpy>=2.0",
    # The VARIANT type needs storage_compatibility_version ≥ v1.5.0.
    "duckdb>=1.5.0",

    # Vortex conversion. Formerly published as `vortex-array`; the rename to
    # `vortex-data` happened in the 0.32 → 0.69 jump. The API is not yet
    # stable across minor releases, hence the cap below the next minor.
    # The call we depend on is `vxio.write(pa_table, path)`; relax the upper
    # bound once that public API carries a stability guarantee.
    "vortex-data>=0.69.0,<0.70.0",

    # Streaming decompression, each needed by specific upstream sources.
    "zstandard>=0.25.0",  # lichess .pgn.zst monthly dumps
    "py7zr>=1.1.0",  # 7z archives
    "unlzw3>=0.2.0",  # UCI diabetes .tar.Z (Unix compress / LZW)

    # Parsers tied to a particular file format.
    "pandas>=2.0",  # xlsx_parse (online-retail-ii)
    "openpyxl>=3.1",  # xlsx backend used by pandas
    "pyreadstat>=1.3",  # sas_xpt_parse (CDC BRFSS .xpt)
    "osmium>=4.3",  # osm_pbf_split (OSM Germany Geofabrik extract)
    "jsonschema>=4.0",  # validate_manifest.py: structural checks on sources.json
]
[project.optional-dependencies]
# Kaggle fetcher: required only when fetch.type = "kaggle" (≈9 slugs).
# Needs a one-time, per-machine setup of ~/.kaggle/kaggle.json credentials.
kaggle = ["kaggle>=2.0"]
# Hugging Face fetcher: required only when fetch.type = "huggingface"
# (a single slug: dbpedia-embeddings).
huggingface = ["huggingface-hub>=0.25"]
# Read-only TUI for browsing sources.json; launch it with
# `python -m scripts.pipeline.browse`.
tui = ["textual>=0.80"]
# Convenience alias bundling the kaggle, huggingface, and tui extras
# (dev is not included).
all = ["kaggle>=2.0", "huggingface-hub>=0.25", "textual>=0.80"]
# Test runner + linter; install via `uv sync --extra dev`.
dev = [
    "pytest>=8.0",
    "ruff>=0.13",
]
[tool.ruff]
# Same floor as `requires-python = ">=3.11"` in [project].
target-version = "py311"
# Maximum line width used by ruff.
line-length = 120
[tool.ruff.lint]
# Deliberately small initial rule set:
#   F    - pyflakes
#   E, W - pycodestyle errors and warnings
#   I    - isort (import ordering)
# B / UP / SIM are candidates to enable once the baseline is clean.
select = [
    "E",
    "F",
    "W",
    "I",
]
# Suppressed rules:
#   E501 (line-too-long): `line-length = 120` already gives headroom, and
#     the occasional comment-heavy line shouldn't fail CI.
#   E701 / E702 (multiple statements on one line): the column-aligned
#     `if cond: return ...` lookup tables are intentional (see e.g.
#     browse.py:219-222) and read better as one-liners than when split
#     across several lines.
ignore = [
    "E501",
    "E701",
    "E702",
]
[tool.uv]
# The project is currently consumed in place from a clone rather than
# installed as a package. Its dependency posture (loose lower-bound
# constraints, no committed uv.lock) is nonetheless arranged for library
# publication. When ready to ship, change `package` to `true` and add a
# [build-system] table declaring a build backend.
package = false