1111
1212To run one or more specific steps, specify which steps you want to run.
1313Any string that is distinct will be enough to select the step.
14+
1415```console
1516compile-dataset --merge
1617compile-dataset --poem-metadata
3031
3132from corppa .config import get_config
3233from corppa .poetry_detection .merge_excerpts import merge_excerpt_files
33-
34- # from corppa.utils.path_utils import find_relative_paths
34+ from corppa . poetry_detection . polars_utils import add_ref_poems_meta
35+ from corppa .poetry_detection . ppa_works import extract_page_meta
3536from corppa .poetry_detection .ref_corpora import save_poem_metadata
3637
3738DEFAULT_CONFIGS = {
3839 "source_excerpt_data" : "excerpt-data" ,
3940 "source_ppa_metadata" : "ppa-data/ppa_works.csv" ,
4041}
4142
43+ #: compile script config options, for run_step method type hints
44+ CompileOpts = dict [str , pathlib .Path ]
45+
4246
4347def load_compilation_config ():
4448 """Load configuration for dataset compilation,
@@ -132,31 +136,74 @@ def load_compilation_config():
132136 }
133137
134138
def load_compiled_excerpts(config: CompileOpts) -> pl.DataFrame:
    """Load compiled excerpts from CSV or compressed CSV file
    based on configured path, whichever file exists (uncompressed first).

    :param config: compile options with ``compiled_excerpt_file`` and
        ``compressed_excerpt_file`` paths
    :returns: excerpt dataframe with ppa work id and page number added
    :raises ValueError: if neither file exists
    """
    for datafile in [
        config["compiled_excerpt_file"],
        config["compressed_excerpt_file"],
    ]:
        if datafile.exists():
            # extract ppa work id and page number
            # (needed for both poem and ppa metadata)
            return extract_page_meta(pl.read_csv(datafile))
    # NOTE: original message was missing the closing parenthesis
    raise ValueError(
        f"Excerpt data file not found (checked {config['compiled_excerpt_file']} "
        f"and {config['compressed_excerpt_file']})"
    )
155+
def get_excerpt_sources(excerpt_data_dir: pathlib.Path) -> list[pathlib.Path]:
    """
    Find all CSV and compressed CSV files in a directory.

    Searches recursively under `excerpt_data_dir`; plain ``.csv`` matches
    are listed before ``.csv.gz`` matches.
    """
    matches: list[pathlib.Path] = []
    # glob recursively for each supported extension in turn
    for pattern in ("**/*.csv", "**/*.csv.gz"):
        matches.extend(excerpt_data_dir.glob(pattern))
    return matches
145163
146164
def save_ppa_metadata(
    input_file: pathlib.Path, output_file: pathlib.Path, excerpts_df: pl.DataFrame
):
    """
    Save PPA work metadata with work-level excerpt totals.
    Takes a PPA metadata file as input, a path for the output file,
    and a dataframe of merged excerpt data.
    Raises a ValueError if metadata file is not a CSV.
    """
    # copy as-is, do not rename or subset any fields
    # NOTE: currently assumes and only supports PPA metadata in csv format
    if input_file.suffix != ".csv":
        raise ValueError(
            f"PPA metadata must be loaded as CSV, got {input_file.suffix.lstrip('.')}"
        )
    work_metadata = pl.read_csv(input_file)

    # per-work totals: count distinct excerpts, poems, and poem authors
    count_fields = {
        "num_excerpts": "excerpt_id",
        "num_poems": "poem_id",
        "num_poets": "poem_author",
    }
    totals = excerpts_df.group_by("ppa_work_id").agg(
        *(
            pl.col(source_col).n_unique().alias(total_col)
            for total_col, source_col in count_fields.items()
        )
    )

    # attach totals to the work metadata; works with no excerpts get zeroes
    work_metadata = work_metadata.join(
        totals, left_on="work_id", right_on="ppa_work_id", how="left"
    ).with_columns(
        [pl.col(total_col).fill_null(0) for total_col in count_fields]
    )
    work_metadata.write_csv(output_file)
158201
def compress_file(uncompressed_file: pathlib.Path, compressed_file: pathlib.Path):
    """
    Compress the `uncompressed_file` passed in with gzip,
    saving it at the `compressed_file` path and deleting the original.
    """
    # stream contents through gzip rather than loading the file into memory
    with (
        open(str(uncompressed_file), "rb") as source,
        gzip.open(str(compressed_file), "wb") as destination,
    ):
        shutil.copyfileobj(source, destination)
    # the uncompressed copy is no longer needed once compression succeeds
    uncompressed_file.unlink()
166213
167214
def run_merge_step(
    compile_opts: CompileOpts,
    excerpts_df: pl.DataFrame | None = None,
    compress_excerpts: bool = False,
) -> pl.DataFrame:
    """Run the merge excerpts step. Finds source excerpt files from the configured
    path, merges excerpts, saves to CSV, and optionally compresses the CSV file.

    :param compile_opts: compilation config options
    :param excerpts_df: ignored; accepted (with a default, so it may be
        omitted) for signature parity with the other run_*_step functions —
        the merge step always regenerates excerpt data from the sources
    :param compress_excerpts: if True, gzip the compiled CSV file
    :returns: dataframe of merged excerpts
    """
    print("## Merging excerpts")
    # find excerpt source files to be included in the compiled dataset file
    excerpt_sources = get_excerpt_sources(compile_opts["source_excerpt_data"])
    # merge into a single uncompressed csv
    excerpts_df = merge_excerpt_files(
        excerpt_sources, compile_opts["compiled_excerpt_file"]
    )
    if compress_excerpts:
        print(
            f"Compressing excerpt data... {compile_opts['compiled_excerpt_file']} → {compile_opts['compressed_excerpt_file']}"
        )
        compress_file(
            compile_opts["compiled_excerpt_file"],
            compile_opts["compressed_excerpt_file"],
        )
    return excerpts_df
235+
236+
def run_poem_metadata_step(
    compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None = None
) -> None:
    """Run the poem metadata compilation step. Uses excerpt data
    (passed in or loaded from compile opts path) to calculate
    poem excerpt totals.
    """
    print("\n## Compiling reference corpora metadata")
    # ensure we have excerpt data with page-level metadata extracted
    if excerpts_df is not None:
        excerpts_df = extract_page_meta(excerpts_df)
    else:
        excerpts_df = load_compiled_excerpts(compile_opts)
    save_poem_metadata(compile_opts["poem_metadata_file"], excerpts_df)
250+
251+
def run_ppa_metadata_step(
    compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None = None
) -> None:
    """Run the PPA metadata compilation step. Uses excerpt data (passed
    in or loaded from compile opts path) to calculate work-level
    excerpt totals.
    """
    print("\n## PPA work-level metadata")
    # ensure we have excerpt data with page-level metadata extracted
    if excerpts_df is not None:
        excerpts_df = extract_page_meta(excerpts_df)
    else:
        excerpts_df = load_compiled_excerpts(compile_opts)

    # pull in reference poem metadata so work-level totals can be computed
    excerpts_df = add_ref_poems_meta(excerpts_df, compile_opts["poem_metadata_file"])

    save_ppa_metadata(
        compile_opts["source_ppa_metadata"],
        compile_opts["ppa_metadata_file"],
        excerpts_df,
    )
272+
273+
274+ def main (cmd_args = None ) -> None :
275+ """
276+ Main entry point for the dataset compilation script. Parses
277+ arguments to determine which steps to run.
278+ """
279+ # allow passing arguments in; if not specified, draw from sys.argv/command line
280+ if cmd_args is None :
281+ cmd_args = sys .argv [1 :]
169282 parser = argparse .ArgumentParser (description = "Compile PPA found-poems dataset" )
170283 parser .add_argument (
171284 "--compress-excerpts" ,
@@ -191,40 +304,26 @@ def main():
191304 action = "append_const" ,
192305 const = step ,
193306 )
194- args = parser .parse_args ()
195- compilation_steps = args .steps # None or list of steps
307+ args = parser .parse_args (cmd_args )
308+ # if not specified, run all steps
309+ compilation_steps = args .steps if args .steps else list (compilation_steps .keys ())
196310
197311 compile_opts = load_compilation_config ()
198312
199- if compilation_steps is None or "merge" in compilation_steps :
200- print ("## Merging excerpts" )
201- # find excerpt source files to be included in the compiled dataset file
202- excerpt_sources = get_excerpt_sources (compile_opts ["source_excerpt_data" ])
203- # merge into a single uncompressed csv
204- # (polars doesn't currently support writing directly to a csv.gz)
205- merge_excerpt_files (excerpt_sources , compile_opts ["compiled_excerpt_file" ])
206- # compress the resulting file if requested
207- if args .compress_excerpts :
208- print (
209- f"Compressing excerpt data... ({ compile_opts ['compiled_excerpt_file' ]} → { compile_opts ['compressed_excerpt_file' ]} )"
210- )
211- compress_file (
212- compile_opts ["compiled_excerpt_file" ],
213- compile_opts ["compressed_excerpt_file" ],
214- )
313+ excerpts_df = None
314+ if "merge" in compilation_steps :
315+ excerpts_df = run_merge_step (compile_opts , excerpts_df , args .compress_excerpts )
215316
216- if compilation_steps is None or "poem_metadata" in compilation_steps :
217- print ("\n ## Compiling reference corpora metadata" )
218- save_poem_metadata (compile_opts ["poem_metadata_file" ])
317+ if "poem_metadata" in compilation_steps :
318+ run_poem_metadata_step (compile_opts , excerpts_df )
219319
220- if compilation_steps is None or "ppa_metadata" in compilation_steps :
221- print ("\n ## PPA work-level metadata" )
222- save_ppa_metadata (
223- compile_opts ["source_ppa_metadata" ], compile_opts ["ppa_metadata_file" ]
224- )
320+ if "ppa_metadata" in compilation_steps :
321+ run_ppa_metadata_step (compile_opts , excerpts_df )
225322
226- print ("\n Remember to commit and push the updated data files" )
227- print (f"cd { compile_opts ['output_data_dir' ].parent } && git add data/*" )
323+ # probably not relevant anymore, not using git-lfs for this data...
324+ print (f"Output files in { compile_opts ['output_data_dir' ]} " )
325+ # print("\nRemember to commit and push the updated data files")
326+ # print(f"cd {compile_opts['output_data_dir'].parent} && git add data/*")
228327
229328
230329if __name__ == "__main__" :
0 commit comments