#!/usr/bin/env python3.10
# Args:
## $1 - token in the form "user:password" for API authentication
## $2 - destination folder path containing the XXX placeholder, which is replaced with the backup number
## $3..$x - source CSV files to scan for URLs whose raw data is then loaded from the API
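#
# Example invocation (hypothetical credentials and paths, shown only to
# illustrate the argument layout above):
#   ./load_all_images.py myuser:app_password ./backup_XXX export1.csv export2.csv
# Intermediate dumps then go to ./backup_1, ./backup_2, ... and the complete
# result to ./backup_final.
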
import csv
from datetime import datetime
import os
import re
import requests
from requests.auth import HTTPBasicAuth
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import sys
import time
from urllib.parse import unquote


# Bitbucket usually allows 1000 requests per hour; keep some headroom for jitter
# https://support.atlassian.com/bitbucket-cloud/docs/api-request-limits/
BITBUCKET_RATE_LIMIT = 970
BITBUCKET_RATE_LIMIT_INTERVAL_SECONDS = 3600 + 60

# Only URLs that start with API_PREFIX and contain API_CONTAINS are downloaded
API_PREFIX = "https://bitbucket.org/"
API_CONTAINS = "/images/"

# Placeholder in the destination path, replaced with the backup number
PATTER_REPLACE_VALUE = "XXX"

# Resolve image URLs through a logged-in browser session instead of basic auth
IS_USE_SELENIUM = True
# How long to wait for a manual login when the image element is not found
TIME_TO_AUTH_SECONDS = 45


def init():
    try:
        # see https://stackoverflow.com/a/15063941/6818663
        csv.field_size_limit(sys.maxsize)
    except OverflowError:
        maxLong = (1 << 31) - 1
        # Windows builds appear to use a C long instead of long long for this limit
        csv.field_size_limit(maxLong)

    # Create the single shared Selenium driver instance if needed
    if IS_USE_SELENIUM:
        global SELENIUM_DRIVER
        SELENIUM_DRIVER = webdriver.Firefox()


def deinit():
    if IS_USE_SELENIUM:
        global SELENIUM_DRIVER
        SELENIUM_DRIVER.quit()


def parse_args(argv):
    USER_PASS = argv[1]
    # split only on the first ':' so passwords containing ':' still work
    userPassSplit = USER_PASS.split(':', 1)
    global AUTH
    AUTH = HTTPBasicAuth(userPassSplit[0], userPassSplit[1])

    global DST_FILE
    DST_FILE = argv[2]

    global SRC_FILES
    SRC_FILES = []
    for p in argv[3:]:
        SRC_FILES.append(p)


def single_query_selenium_get_true_url(url):
    global SELENIUM_DRIVER
    SELENIUM_DRIVER.get(url)
    rawImagePath = '//img'
    try:
        # get the image element that carries the real source URL
        img = SELENIUM_DRIVER.find_element(By.XPATH, rawImagePath)
    except NoSuchElementException:
        print(f"Cannot find the image. Log in on that page; you have {TIME_TO_AUTH_SECONDS} seconds")
        time.sleep(TIME_TO_AUTH_SECONDS)
        # retry after the manual login
        img = SELENIUM_DRIVER.find_element(By.XPATH, rawImagePath)
    src = img.get_attribute('src')
    return src


def single_query(url):
    if IS_USE_SELENIUM:
        # resolve the true image URL via the browser session first
        url = single_query_selenium_get_true_url(url)
        res = requests.get(url)
    else:
        res = requests.get(url, auth=AUTH)
    res.raise_for_status()
    return res.content


def load_csv_data():
    # collect every cell from every source CSV file into a flat list
    res = []
    for f in SRC_FILES:
        with open(f, "r", encoding="utf8") as src:
            inReader = csv.reader(src)
            for row in inReader:
                for r in row:
                    res.append(r)
    return res


def select_only_urls(data):
    urls = []
    for d in data:
        # URL match regex from https://uibakery.io/regex-library/url-regex-python
        matches = re.findall(r'https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)', d)
        for m in matches:
            urls.append(m)

    # keep only Bitbucket image URLs
    res = [d for d in urls if d.startswith(API_PREFIX) and API_CONTAINS in d]

    for i, r in enumerate(res):
        if r.endswith(')'):
            # the regex also captures a trailing ')' (e.g. from Markdown-style links)
            res[i] = r[:-1]

    # need only unique urls
    res = list(set(res))
    return res
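
# Example (hypothetical CSV cell) of what select_only_urls() extracts:
#   "see ![shot](https://bitbucket.org/team/repo/images/diagram.png) for details"
# yields ["https://bitbucket.org/team/repo/images/diagram.png"] once the
# trailing ')' captured by the regex has been stripped.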


def dump_results(path, obj):
    try:
        # recursive folder creation
        os.makedirs(path, exist_ok=True)
        for o in obj:
            blob = obj[o]
            # build a flat file name from the decoded URL
            fileName = unquote(o).replace(':', '_').replace('/', '_')
            with open(path + os.path.sep + fileName, "wb") as f:
                f.write(blob)
    except Exception as e:
        print(f"Exception was caught while dumping results to path '{path}'")
        print(e)
        print()
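
# Example (hypothetical URL) of the file name dump_results() produces:
#   "https://bitbucket.org/team/repo/images/My%20Image.png"
# is decoded and flattened to "https___bitbucket.org_team_repo_images_My Image.png"
# inside the dump folder.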


def load_data_from_urls_with_backup(urls):
    step = 0
    i = 0
    # collected results: url -> raw image bytes
    res = {}
    for d in urls:
        if i == 0:
            # remember when the current rate-limit window started
            ts = time.time()
        i += 1

        try:
            res[d] = single_query(d)
        except requests.exceptions.HTTPError as e:
            print(f"HTTP exception was caught for url '{d}'")
            print(f"HTTP code {e.response.status_code}")
            print(e.response.text)
            print()
            if e.response.status_code == 401:
                print("Cannot authorize on the server. Bad credentials, missing permissions, or OAuth2 is required (not supported). Exiting...")
                if not IS_USE_SELENIUM:
                    print("Try enabling Selenium")
                print()
                exit(1)
        except NoSuchElementException:
            print(f"Cannot find the image element in Selenium for url {d}. Skipping it.")
            print()
        except Exception as e:
            print(f"Exception was caught for url '{d}'")
            print(e)
            print()

        if i >= BITBUCKET_RATE_LIMIT:
            i = 0
            step += 1

            # temporary save of everything collected so far
            resFileName = DST_FILE.replace(PATTER_REPLACE_VALUE, str(step))
            dump_results(resFileName, res)
            print("Dump", resFileName, "was written")

            # sleep until one full rate-limit interval has passed since the window started
            newTs = ts + BITBUCKET_RATE_LIMIT_INTERVAL_SECONDS
            tsDiff = newTs - time.time()
            print(datetime.now(), "Will wake up in", tsDiff, "seconds")
            time.sleep(tsDiff)

    # saving full results
    resFileName = DST_FILE.replace(PATTER_REPLACE_VALUE, 'final')
    dump_results(resFileName, res)
    print("Final dump", resFileName, "was written")


def main(argv):
    try:
        init()
        parse_args(argv)
        data = load_csv_data()
        urls = select_only_urls(data)
        load_data_from_urls_with_backup(urls)
    finally:
        deinit()


if __name__ == '__main__':
    main(sys.argv)