# pyproject.toml for raincloud (spiraldb/raincloud, develop branch).
[project]
name = "raincloud"
version = "0.1.1"
description = "Client-reproducible pipeline for building a curated catalog of public datasets as Parquet + Vortex files."
readme = "README.md"
requires-python = ">=3.11"
authors = [{ name = "Raincloud Maintainers", email = "raincloud@spiraldb.com" }]
license = "Apache-2.0"
license-files = ["LICENSE"]

# Runtime dependencies, grouped by the role they play in the pipeline.
# Lower bounds are intentionally loose, but each floor must still be
# installable together with the others.
dependencies = [
    # Core Arrow / Parquet handling — used by every stage.
    "pyarrow>=23.0",
    "numpy>=2.0",
    "duckdb>=1.5.0",            # VARIANT type requires storage_compatibility_version ≥ v1.5.0
    # Vortex conversion. The package was renamed from `vortex-array` to
    # `vortex-data` in the 0.32 → 0.69 move and the API isn't yet stable
    # across minor versions, so we cap at the next minor. Loosen once the
    # public API (`vxio.write(pa_table, path)` is what we depend on) is
    # backed by a stability guarantee.
    "vortex-data>=0.69.0,<0.70.0",
    # Streaming decompression formats surfaced in specific upstreams.
    "zstandard>=0.25.0",        # lichess .pgn.zst monthly dumps
    "py7zr>=1.1.0",             # 7z archives
    "unlzw3>=0.2.0",            # UCI diabetes .tar.Z (Unix compress / LZW)
    # Format-specific parsers.
    # The pandas floor is 2.2.2 rather than 2.0: 2.2.2 is the first pandas
    # release whose wheels work with NumPy 2.x, which `numpy>=2.0` above
    # makes mandatory. A lower floor would admit an unimportable pairing.
    "pandas>=2.2.2",            # xlsx_parse (online-retail-ii)
    "openpyxl>=3.1",            # pandas xlsx backend
    "pyreadstat>=1.3",          # sas_xpt_parse (CDC BRFSS .xpt)
    "osmium>=4.3",              # osm_pbf_split (OSM Germany Geofabrik extract)
    "jsonschema>=4.0",          # validate_manifest.py — sources.json structural checks
]

[project.optional-dependencies]
# Sources with fetch.type = "kaggle" (≈9 slugs). One-time setup per machine:
# place credentials in ~/.kaggle/kaggle.json.
kaggle = [
    "kaggle>=2.0",
]
# Sources with fetch.type = "huggingface" (1 slug: dbpedia-embeddings).
huggingface = [
    "huggingface-hub>=0.25",
]
# Read-only TUI browser for sources.json, launched with
# `python -m scripts.pipeline.browse`.
tui = [
    "textual>=0.80",
]
# Convenience extra: every optional feature above in one install. The pins are
# repeated here, so keep them in sync with kaggle / huggingface / tui.
# `dev` is not part of it.
all = ["kaggle>=2.0", "huggingface-hub>=0.25", "textual>=0.80"]
# Test runner + linter. Install with `uv sync --extra dev`.
dev = [
    "pytest>=8.0",
    "ruff>=0.13",
]

[tool.ruff]
# Lint against the same Python floor as `requires-python` (>=3.11).
target-version = "py311"
line-length = 120

[tool.ruff.lint]
# Conservative starter rule set. B / UP / SIM are candidates to add later,
# once the baseline is clean.
select = [
    "E",  # pycodestyle errors
    "F",  # pyflakes
    "W",  # pycodestyle warnings
    "I",  # isort (import ordering)
]
ignore = [
    # line-too-long: `line-length = 120` gives headroom, and the rare
    # comment-heavy line shouldn't fail CI.
    "E501",
    # multiple-statements-on-one-line: the column-aligned
    # `if cond: return ...` lookup tables (see e.g. browse.py:219-222) are
    # deliberate and read better as one-liners than split across lines.
    "E701",
    "E702",
]

[tool.uv]
# The project is currently used in place from a clone, so uv is told not to
# treat it as a package. The dependency posture (loose lower bounds, no
# committed uv.lock) is already set up for publishing as a library. When
# you're ready to ship, change `package` to `true` and add a build backend
# via a [build-system] table.
package = false