Skip to content
This repository was archived by the owner on Sep 28, 2022. It is now read-only.

Commit 0dfe44a

Browse files
authored
Merge pull request #146 from yuce/multi-row-csv-example
Added multiple column CSV import example
2 parents 6f195c1 + 770f992 commit 0dfe44a

File tree

4 files changed

+130
-0
lines changed

4 files changed

+130
-0
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Pilosa Multi-field CSV Import
2+
3+
## Prerequisites
4+
5+
* Python 3.6 or better
6+
7+
## Install
8+
9+
Create a virtual environment:
10+
11+
$ python3 -m venv ve
12+
13+
Activate the virtual environment:
14+
15+
$ source ve/bin/activate
16+
17+
Install requirements:
18+
19+
$ pip install -r requirements.txt
20+
21+
## Usage
22+
23+
* Update the `INDEX_NAME`, `INDEX_KEYS` and `FIELDS` settings at the top of `import.py` so that they match the columns of the CSV file.
24+
* Run it with the Pilosa address (by default: `localhost:10101`) and the name of the CSV file; both arguments are required:
25+
26+
$ python import.py :10101 sample.csv
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#! /usr/bin/env python3
2+
3+
import sys
4+
import threading
5+
6+
from pilosa import Client, Schema
7+
from pilosa.imports import Column, FieldValue
8+
9+
# Settings: adapt these to match the CSV file.
# The first CSV column identifies the record; the entries of FIELDS are listed
# in the same order as the remaining CSV columns.
INDEX_NAME = "my-index"
# True when the first CSV column holds string keys rather than integer IDs.
INDEX_KEYS = True
# One entry per imported CSV column: the field name plus the options passed
# on when the field is created.
FIELDS = [
    dict(name="size", opts=dict(keys=True)),
    dict(name="color", opts=dict(keys=True)),
    dict(name="age", opts=dict(int_min=0, int_max=150)),
]
# -----------------------------
18+
19+
class MultiColumnBitIterator:
    """Turn the rows of a CSV file into import records for a single field.

    For every CSV row, the cell at ``column_index`` identifies the column and
    the cell at ``row_index`` supplies the row (or the integer value for
    ``int`` fields). Calling the instance returns a generator over the
    resulting ``Column`` / ``FieldValue`` records.
    """

    def __init__(self,
                 file_obj, field,
                 column_index=0, row_index=1,
                 has_header=True):
        """Store the input and choose the record builder matching ``field``.

        Args:
            file_obj: Open text file (an iterator of lines) with the CSV data.
            field: Field to import into; ``field.field_type``, ``field.keys``
                and ``field.index.keys`` decide how the cells are converted.
            column_index: Position of the cell holding the column ID or key.
            row_index: Position of the cell holding the row ID/key, or the
                integer value for ``int`` fields.
            has_header: When true, the first line is consumed and discarded
                at construction time.
        """
        self.file_obj = file_obj
        if has_header:
            # if there's a header skip it; the default keeps an empty file
            # from raising StopIteration out of the constructor
            next(self.file_obj, None)

        ci = column_index
        ri = row_index

        # set the bit yielder
        if field.field_type == "int":
            if field.index.keys:
                self.yield_fun = lambda fs: FieldValue(column_key=fs[ci], value=int(fs[ri]))
            else:
                self.yield_fun = lambda fs: FieldValue(column_id=int(fs[ci]), value=int(fs[ri]))
        else:
            if field.index.keys:
                if field.keys:
                    self.yield_fun = lambda fs: Column(column_key=fs[ci], row_key=fs[ri])
                else:
                    self.yield_fun = lambda fs: Column(column_key=fs[ci], row_id=int(fs[ri]))
            else:
                if field.keys:
                    self.yield_fun = lambda fs: Column(column_id=int(fs[ci]), row_key=fs[ri])
                else:
                    self.yield_fun = lambda fs: Column(column_id=int(fs[ci]), row_id=int(fs[ri]))

    def __call__(self):
        """Yield one import record for every non-blank CSV row.

        Rows are parsed with the standard ``csv`` module, so quoted cells may
        contain commas. Surrounding whitespace is stripped from every cell.
        """
        # imported here so the class does not depend on the module's import list
        import csv

        yield_fun = self.yield_fun
        for row in csv.reader(self.file_obj, skipinitialspace=True):
            # split fields
            fs = [x.strip() for x in row]
            # skip empty lines: csv yields [] for an empty line and a single
            # empty cell for a whitespace-only line
            if not fs or fs == [""]:
                continue
            # return a bit
            yield yield_fun(fs)
62+
63+
def import_field(client, field, path, row_index):
    """Import one CSV column into ``field``.

    Opens the CSV file at ``path``, builds a ``MultiColumnBitIterator`` that
    reads the cell at ``row_index`` of every row, and hands the resulting
    record generator to ``client.import_field``.
    """
    with open(path) as csv_file:
        records = MultiColumnBitIterator(csv_file, field, row_index=row_index)()
        client.import_field(field, records)
67+
68+
def import_csv(pilosa_addr, path):
    """Create the schema and import every configured field from a CSV file.

    The index and fields described by ``INDEX_NAME``, ``INDEX_KEYS`` and
    ``FIELDS`` are synced to the server at ``pilosa_addr``. Each field is then
    imported concurrently by ``import_field``: the field at position ``i`` of
    ``FIELDS`` reads CSV column ``i + 1``.

    Args:
        pilosa_addr: Address of the Pilosa server, passed to ``Client``.
        path: Path of the CSV file to import.

    Raises:
        Exception: Whatever a failed field import raised. Errors in worker
            threads are re-raised here instead of being lost.
    """
    # imported here so the function does not depend on the module's import list
    from concurrent.futures import ThreadPoolExecutor

    client = Client(pilosa_addr)

    # create the schema
    schema = Schema()
    index = schema.index(INDEX_NAME, keys=INDEX_KEYS, track_existence=True)
    fields = [index.field(field["name"], **field["opts"]) for field in FIELDS]
    client.sync_schema(schema)

    # nothing to import; ThreadPoolExecutor also rejects max_workers=0
    if not fields:
        return

    # import each field in its own worker thread
    with ThreadPoolExecutor(max_workers=len(fields)) as pool:
        futures = [
            pool.submit(import_field, client, field, path, row_index)
            for row_index, field in enumerate(fields, start=1)
        ]
        # result() re-raises any exception from the worker, so a failed
        # import is reported to the caller
        for future in futures:
            future.result()
86+
87+
88+
def main():
    """Command-line entry point: ``import.py pilosa_address csv_file``.

    Prints a usage message and exits with status 1 unless exactly two
    arguments are given; otherwise runs ``import_csv`` with them.
    """
    args = sys.argv[1:]
    if len(args) != 2:
        print(f"Usage: {sys.argv[0]} pilosa_address csv_file")
        sys.exit(1)

    pilosa_addr, path = args
    import_csv(pilosa_addr, path)


if __name__ == "__main__":
    main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pilosa==1.3.1
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
ID,Size,Color,Age
2+
ABDJ,small,green,42
3+
HFZP,large,red,99
4+
EJSK,medium,purple,22
5+
EJSK,large,green,35

0 commit comments

Comments
 (0)