-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjoin-demo.py
More file actions
71 lines (61 loc) · 2.78 KB
/
join-demo.py
File metadata and controls
71 lines (61 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import argparse
import palimpzest as pz
# define columns for datasets
# Column specs passed to sem_map() for the text datasets: every column is a
# string field, described by a natural-language "desc".
text_animal_cols = [
    {"name": col_name, "type": str, "desc": col_desc}
    for col_name, col_desc in (
        ("animal", "The type of animal mentioned in the text"),
        ("color", "The color of the animal mentioned in the text"),
    )
]
# Column specs passed to sem_map() for the image datasets: every column is a
# string field, described by a natural-language "desc".
image_animal_cols = [
    {"name": col_name, "type": str, "desc": col_desc}
    for col_name, col_desc in (
        ("animal", "The type of animal in the image"),
        ("color", "The color of the animal in the image"),
    )
]
# query plans
def run_text_join():
    """Join the animal-text dataset with itself and print the result.

    Both sides read ``join-data/animal-texts/`` (registered under the ids
    ``animals1`` and ``animals2``) and are mapped with ``text_animal_cols``.
    The two sides are then combined with a semantic join and the resulting
    records are printed as a DataFrame.
    """
    texts_dir = "join-data/animal-texts/"

    left = pz.TextFileDataset(id="animals1", path=texts_dir)
    left = left.sem_map(text_animal_cols)
    right = pz.TextFileDataset(id="animals2", path=texts_dir)
    right = right.sem_map(text_animal_cols)

    joined = left.sem_join(
        right,
        condition="both animals are canines with the same color",
    )

    # Run with the max-quality policy, the parallel execution strategy, and a
    # join parallelism of 64.
    processor_config = pz.QueryProcessorConfig(
        policy=pz.MaxQuality(),
        execution_strategy="parallel",
        join_parallelism=64,
    )
    records = joined.run(processor_config)
    print(records.to_df())
def run_image_join():
    """Join the animal-image dataset with itself and print the result.

    Both sides read ``join-data/animal-images/`` (registered under the ids
    ``animals1`` and ``animals2``) and are mapped with ``image_animal_cols``.
    The two sides are then combined with a semantic join and the resulting
    records are printed as a DataFrame.
    """
    images_dir = "join-data/animal-images/"

    left = pz.ImageFileDataset(id="animals1", path=images_dir)
    left = left.sem_map(image_animal_cols)
    right = pz.ImageFileDataset(id="animals2", path=images_dir)
    right = right.sem_map(image_animal_cols)

    joined = left.sem_join(
        right,
        condition="both animals are canines with the same color",
    )

    # Run with the max-quality policy, the parallel execution strategy, and a
    # join parallelism of 64.
    processor_config = pz.QueryProcessorConfig(
        policy=pz.MaxQuality(),
        execution_strategy="parallel",
        join_parallelism=64,
    )
    records = joined.run(processor_config)
    print(records.to_df())
def run_text_image_join():
    """Join the animal-text dataset with the animal-image dataset and print it.

    The left side reads ``join-data/animal-texts/`` (id ``animals1``) and is
    mapped with ``text_animal_cols``; the right side reads
    ``join-data/animal-images/`` (id ``animals2``) and is mapped with
    ``image_animal_cols``. The two sides are then combined with a semantic
    join and the resulting records are printed as a DataFrame.
    """
    text_side = pz.TextFileDataset(id="animals1", path="join-data/animal-texts/")
    text_side = text_side.sem_map(text_animal_cols)
    image_side = pz.ImageFileDataset(id="animals2", path="join-data/animal-images/")
    image_side = image_side.sem_map(image_animal_cols)

    joined = text_side.sem_join(
        image_side,
        condition="both animals are canines with the same color",
    )

    # Run with the max-quality policy, the parallel execution strategy, and a
    # join parallelism of 64.
    processor_config = pz.QueryProcessorConfig(
        policy=pz.MaxQuality(),
        execution_strategy="parallel",
        join_parallelism=64,
    )
    records = joined.run(processor_config)
    print(records.to_df())
if __name__ == "__main__":
    # Script entry point: parse --task and run the matching join demo.
    # Each accepted --task value maps to the demo function it runs.
    demos = {
        "text-join": run_text_join,
        "image-join": run_image_join,
        "text-image-join": run_text_image_join,
    }

    parser = argparse.ArgumentParser(description="Run the Palimpzest join demo.")
    parser.add_argument("--task", type=str, help="Which join demo to run")
    args = parser.parse_args()

    # A missing --task leaves args.task as None, which is not a key in the
    # table, so it is reported the same way as an unrecognized value.
    demo = demos.get(args.task)
    if demo is None:
        print("Please provide a valid task: one of 'text-join', 'image-join', 'text-image-join'")
        # Raise SystemExit directly rather than calling the builtin exit():
        # that helper is injected by the `site` module for interactive use
        # and is not guaranteed to exist (e.g. under `python -S`).
        raise SystemExit(1)

    demo()