Skip to content

Commit 2c96404

Browse files
authored
Merge pull request #260 from Princeton-CDH/feature/aggregate-counts
Feature/aggregate counts
2 parents 5dab476 + 08735c9 commit 2c96404

8 files changed

Lines changed: 821 additions & 138 deletions

File tree

sample_config.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ reference_corpora:
2626
# internet_poems:
2727
# tarball of directory of text files OR expanded directory;
2828
# some functionality will only work with the expanded directory
29-
# text_dir: "internet_poems/internet_poems_texts.tar.gz"
29+
# text_path: "internet_poems/internet_poems_texts.tar.gz"
3030
# chadwyck-healey:
3131
# tarball of directory of text files OR expanded directory;
3232
# some functionality will only work with the expanded directory
33-
# text_dir: "chadwyck-healey/chadwyck-healey_texts.tar.gz"
33+
# text_path: "chadwyck-healey/chadwyck-healey_texts.tar.gz"
3434
# metadata_path: "chadwyck-healey/chadwyck-healey.csv"
3535
other:
3636
# Provide a URL or local path to "Other Poems" metadata

src/corppa/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@
2929
"base_dir": "ref-corpora",
3030
# paths are relative to base_dir
3131
"internet_poems": {
32-
# tarball of directory of text files OR expanded directory
33-
"text_dir": "internet_poems/internet_poems_texts.tar.gz"
32+
# tarball of text files OR expanded directory
33+
"text_path": "internet_poems/internet_poems_texts.tar.gz"
3434
},
3535
"chadwyck-healey": {
36-
"text_dir": "chadwyck-healey/chadwyck-healey_texts.tar.gz",
36+
"text_path": "chadwyck-healey/chadwyck-healey_texts.tar.gz",
3737
"metadata_path": "chadwyck-healey/chadwyck-healey.csv",
3838
},
3939
# other poems metadata_path configuration required

src/corppa/poetry_detection/compile_dataset.py

Lines changed: 139 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
1212
To run one or more specific steps, specify which steps you want to run.
1313
Any string that is distinct will be enough to select the step.
14+
1415
```console
1516
compile-dataset --merge
1617
compile-dataset --poem-metadata
@@ -30,15 +31,18 @@
3031

3132
from corppa.config import get_config
3233
from corppa.poetry_detection.merge_excerpts import merge_excerpt_files
33-
34-
# from corppa.utils.path_utils import find_relative_paths
34+
from corppa.poetry_detection.polars_utils import add_ref_poems_meta
35+
from corppa.poetry_detection.ppa_works import extract_page_meta
3536
from corppa.poetry_detection.ref_corpora import save_poem_metadata
3637

3738
DEFAULT_CONFIGS = {
3839
"source_excerpt_data": "excerpt-data",
3940
"source_ppa_metadata": "ppa-data/ppa_works.csv",
4041
}
4142

43+
#: compile script config options, for run_step method type hints
44+
CompileOpts = dict[str, pathlib.Path]
45+
4246

4347
def load_compilation_config():
4448
"""Load configuration for dataset compilation,
@@ -132,31 +136,74 @@ def load_compilation_config():
132136
}
133137

134138

139+
def load_compiled_excerpts(config: CompileOpts) -> pl.DataFrame:
    """Load compiled excerpts from CSV or compressed CSV file
    based on configured path, whichever file exists (uncompressed first).

    Raises a ValueError if neither file exists.
    """
    for datafile in [
        config["compiled_excerpt_file"],
        config["compressed_excerpt_file"],
    ]:
        if datafile.exists():
            # extract ppa work id and page number (needed for both poem and ppa metadata)
            return extract_page_meta(pl.read_csv(datafile))
    # fixed: message previously had an unbalanced opening parenthesis
    raise ValueError(
        f"Excerpt data file not found (checked {config['compiled_excerpt_file']} "
        f"and {config['compressed_excerpt_file']})"
    )
154+
155+
135156
def get_excerpt_sources(excerpt_data_dir: pathlib.Path) -> list[pathlib.Path]:
    """
    Collect every CSV and gzip-compressed CSV file found anywhere
    under `excerpt_data_dir` (searched recursively).
    """
    # plain CSV files first, then compressed ones, matching glob order
    patterns = ("**/*.csv", "**/*.csv.gz")
    return [
        source_path
        for pattern in patterns
        for source_path in excerpt_data_dir.glob(pattern)
    ]
139-
# wondered about using find_relative_paths here, but we actually
140-
# want non-relative paths and we need to handle a two-part extension
141-
# return [
142-
# excerpt_data_dir / rel_path
143-
# for rel_path in find_relative_paths(excerpt_data_dir, exts=[".csv", ".gz"]) # can we assume .gz == .csv.gz ?
144-
# ]
145163

146164

147-
def save_ppa_metadata(input_file: pathlib.Path, output_file: pathlib.Path):
165+
def save_ppa_metadata(
    input_file: pathlib.Path, output_file: pathlib.Path, excerpts_df: pl.DataFrame
):
    """
    Write PPA work metadata to `output_file`, augmented with work-level
    excerpt totals computed from `excerpts_df` (a dataframe of merged
    excerpt data). Raises a ValueError if the metadata file is not a CSV.
    """
    # NOTE: currently assumes and only supports PPA metadata in csv format;
    # fields are copied as-is, without renaming or subsetting
    if input_file.suffix != ".csv":
        raise ValueError(
            f"PPA metadata must be loaded as CSV, got {input_file.suffix.lstrip('.')}"
        )

    # per-work aggregates: distinct excerpts, poems, and poem authors
    work_totals = excerpts_df.group_by("ppa_work_id").agg(
        pl.col("excerpt_id").n_unique().alias("num_excerpts"),
        pl.col("poem_id").n_unique().alias("num_poems"),
        pl.col("poem_author").n_unique().alias("num_poets"),
    )

    # attach totals to the work metadata; works without any excerpts
    # get zeroes instead of nulls from the left join
    augmented = (
        pl.read_csv(input_file)
        .join(work_totals, left_on="work_id", right_on="ppa_work_id", how="left")
        .with_columns(
            pl.col("num_excerpts").fill_null(0),
            pl.col("num_poems").fill_null(0),
            pl.col("num_poets").fill_null(0),
        )
    )
    augmented.write_csv(output_file)
157200

158201

159-
def compress_file(uncompressed_file, compressed_file):
202+
def compress_file(uncompressed_file: pathlib.Path, compressed_file: pathlib.Path):
    """
    Gzip-compress `uncompressed_file`, writing the result to the
    `compressed_file` path and deleting the original afterwards.
    """
    with open(str(uncompressed_file), "rb") as source, gzip.open(
        str(compressed_file), "wb"
    ) as sink:
        shutil.copyfileobj(source, sink)
    # the uncompressed original is redundant once the gzip copy exists
    uncompressed_file.unlink()
166213

167214

168-
def main():
215+
def run_merge_step(
    compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None, compress_excerpts: bool
) -> pl.DataFrame:
    """Run the merge excerpts step. Finds source excerpt files from the configured
    path, merges excerpts, saves to CSV, and optionally compresses the CSV file.

    Returns the merged excerpt dataframe. The `excerpts_df` argument is
    accepted for signature consistency with the other step runners but is
    ignored; merged data is always regenerated from the source files.
    """
    print("## Merging excerpts")
    excerpt_sources = get_excerpt_sources(compile_opts["source_excerpt_data"])
    # merge into a single uncompressed csv; merge_excerpt_files writes the
    # file and returns the merged dataframe for use by later steps
    excerpts_df = merge_excerpt_files(
        excerpt_sources, compile_opts["compiled_excerpt_file"]
    )
    if compress_excerpts:
        # fixed: previously the two paths were concatenated with no
        # delimiter in the status message
        print(
            f"Compressing excerpt data... ({compile_opts['compiled_excerpt_file']} -> "
            f"{compile_opts['compressed_excerpt_file']})"
        )
        compress_file(
            compile_opts["compiled_excerpt_file"],
            compile_opts["compressed_excerpt_file"],
        )
    return excerpts_df
235+
236+
237+
def run_poem_metadata_step(
    compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None = None
) -> None:
    """Run the poem metadata compilation step. Uses excerpt data
    (passed in, or loaded from the configured path when omitted)
    to calculate poem excerpt totals.
    """
    print("\n## Compiling reference corpora metadata")
    # data loaded from disk already has page metadata extracted;
    # a dataframe passed in directly still needs that step applied
    excerpts_df = (
        load_compiled_excerpts(compile_opts)
        if excerpts_df is None
        else extract_page_meta(excerpts_df)
    )
    save_poem_metadata(compile_opts["poem_metadata_file"], excerpts_df)
250+
251+
252+
def run_ppa_metadata_step(
    compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None = None
) -> None:
    """Run the PPA metadata compilation step. Uses excerpt data
    (passed in, or loaded from the configured path when omitted)
    to calculate work-level excerpt totals.
    """
    print("\n## PPA work-level metadata")
    # data loaded from disk already has page metadata extracted;
    # a dataframe passed in directly still needs that step applied
    excerpts_df = (
        load_compiled_excerpts(compile_opts)
        if excerpts_df is None
        else extract_page_meta(excerpts_df)
    )

    # attach reference-poem metadata before computing work-level totals
    excerpts_df = add_ref_poems_meta(excerpts_df, compile_opts["poem_metadata_file"])

    save_ppa_metadata(
        compile_opts["source_ppa_metadata"],
        compile_opts["ppa_metadata_file"],
        excerpts_df,
    )
272+
273+
274+
def main(cmd_args=None) -> None:
275+
"""
276+
Main entry point for the dataset compilation script. Parses
277+
arguments to determine which steps to run.
278+
"""
279+
# allow passing arguments in; if not specified, draw from sys.argv/command line
280+
if cmd_args is None:
281+
cmd_args = sys.argv[1:]
169282
parser = argparse.ArgumentParser(description="Compile PPA found-poems dataset")
170283
parser.add_argument(
171284
"--compress-excerpts",
@@ -191,40 +304,26 @@ def main():
191304
action="append_const",
192305
const=step,
193306
)
194-
args = parser.parse_args()
195-
compilation_steps = args.steps # None or list of steps
307+
args = parser.parse_args(cmd_args)
308+
# if not specified, run all steps
309+
compilation_steps = args.steps if args.steps else list(compilation_steps.keys())
196310

197311
compile_opts = load_compilation_config()
198312

199-
if compilation_steps is None or "merge" in compilation_steps:
200-
print("## Merging excerpts")
201-
# find excerpt source files to be included in the compiled dataset file
202-
excerpt_sources = get_excerpt_sources(compile_opts["source_excerpt_data"])
203-
# merge into a single uncompressed csv
204-
# (polars doesn't currently support writing directly to a csv.gz)
205-
merge_excerpt_files(excerpt_sources, compile_opts["compiled_excerpt_file"])
206-
# compress the resulting file if requested
207-
if args.compress_excerpts:
208-
print(
209-
f"Compressing excerpt data... ({compile_opts['compiled_excerpt_file']}{compile_opts['compressed_excerpt_file']})"
210-
)
211-
compress_file(
212-
compile_opts["compiled_excerpt_file"],
213-
compile_opts["compressed_excerpt_file"],
214-
)
313+
excerpts_df = None
314+
if "merge" in compilation_steps:
315+
excerpts_df = run_merge_step(compile_opts, excerpts_df, args.compress_excerpts)
215316

216-
if compilation_steps is None or "poem_metadata" in compilation_steps:
217-
print("\n## Compiling reference corpora metadata")
218-
save_poem_metadata(compile_opts["poem_metadata_file"])
317+
if "poem_metadata" in compilation_steps:
318+
run_poem_metadata_step(compile_opts, excerpts_df)
219319

220-
if compilation_steps is None or "ppa_metadata" in compilation_steps:
221-
print("\n## PPA work-level metadata")
222-
save_ppa_metadata(
223-
compile_opts["source_ppa_metadata"], compile_opts["ppa_metadata_file"]
224-
)
320+
if "ppa_metadata" in compilation_steps:
321+
run_ppa_metadata_step(compile_opts, excerpts_df)
225322

226-
print("\nRemember to commit and push the updated data files")
227-
print(f"cd {compile_opts['output_data_dir'].parent} && git add data/*")
323+
# probably not relevant anymore, not using git-lfs for this data...
324+
print(f"Output files in {compile_opts['output_data_dir']}")
325+
# print("\nRemember to commit and push the updated data files")
326+
# print(f"cd {compile_opts['output_data_dir'].parent} && git add data/*")
228327

229328

230329
if __name__ == "__main__":

src/corppa/poetry_detection/merge_excerpts.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,9 @@ def merge_excerpts(
194194
return pl.concat([output_df, merged_output_df], how="diagonal")
195195

196196

197-
def merge_excerpt_files(input_files, output_file):
197+
def merge_excerpt_files(
198+
input_files: list[pathlib.Path], output_file: pathlib.Path
199+
) -> pl.DataFrame:
198200
total_excerpts = 0
199201
input_dfs = []
200202

@@ -259,7 +261,11 @@ def merge_excerpt_files(input_files, output_file):
259261
# row is a tuple of value, count
260262
print(f"\t{row[0]}: {row[1]:,}")
261263

264+
# polars supports compression; but not sure what version it
265+
# was added in, and documentation says it is unstable. Use that in future
262266
excerpts.write_csv(output_file)
267+
# return excerpt data frame
268+
return excerpts
263269

264270

265271
def main():

0 commit comments

Comments
 (0)