-
-
Notifications
You must be signed in to change notification settings - Fork 669
Expand file tree
/
Copy pathgetweblinks.py
More file actions
54 lines (41 loc) · 1.67 KB
/
getweblinks.py
File metadata and controls
54 lines (41 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from modules.net_utils import get_urls_from_page, get_url_status
from modules import pagereader
from bs4 import BeautifulSoup
from modules.bcolors import Bcolors
def add_green(link):
    """
    Decorate a link for terminal output as a "good" (green) entry.

    Args:
        link: The URL text to decorate.

    Returns:
        The link prefixed with a tab and Bcolors.OKGREEN and followed by
        Bcolors.ENDC (presumably ANSI colour start/reset codes -- see
        modules.bcolors).
    """
    palette = Bcolors()
    prefix = '\t' + palette.OKGREEN
    return prefix + link + palette.ENDC
def add_red(link):
    """
    Decorate a link for terminal output as a "bad" (red) entry.

    Args:
        link: The URL text to decorate.

    Returns:
        The link prefixed with a tab and Bcolors.On_Red and followed by
        Bcolors.ENDC (presumably ANSI colour start/reset codes -- see
        modules.bcolors).
    """
    palette = Bcolors()
    prefix = '\t' + palette.On_Red
    return prefix + link + palette.ENDC
def get_links(soup, ext=False, live=False):
    """
    Collect the links found on a parsed page and print a summary of them.

    The URLs are gathered by get_urls_from_page (per the original notes it
    searches the <a href> hyperlink tags and keeps correctly formatted
    URLs). The number of URLs found is always printed; when `live` is true
    every URL is additionally probed and printed as one coloured row.

    Args:
        soup: BeautifulSoup instance currently being used.
        ext: Forwarded to get_urls_from_page as its `extension` argument
            (presumably a domain-extension filter -- confirm in
            modules.net_utils).
        live: When true, each URL is checked with get_url_status and a row
            is printed for it: green with the page title when the status
            is non-zero, red with "Not found" when the status is 0.

    Returns:
        websites: List of websites that were found

    Raises:
        TypeError: If `soup` is not a BeautifulSoup instance. TypeError is
            a subclass of Exception, so callers catching Exception still
            work.
    """
    # Reject bad input before doing any work or printing anything.
    if not isinstance(soup, BeautifulSoup):
        raise TypeError('Method parameter is not of instance BeautifulSoup')

    b_colors = Bcolors()
    websites = get_urls_from_page(soup, extension=ext)

    # Pretty print output as below
    print(''.join((b_colors.OKGREEN,
                   'Websites Found - ', b_colors.ENDC, str(len(websites)))))
    print('------------------------------------')

    if live:
        for link in websites:
            # A status of 0 is treated as "could not be reached".
            if get_url_status(link) == 0:
                print_row(add_red(link), "Not found")
                continue

            # Only the first element of the reader's result is used here;
            # it is expected to expose a BeautifulSoup-style `.title`.
            page = pagereader.read_first_page(link)[0]
            title = None
            if page is not None and page.title is not None:
                title = page.title.string

            # Always list a reachable link, even when its page has no
            # usable title, so the rows match the count printed above.
            print_row(add_green(link), title if title else "No title")

    return websites
def print_row(url, description):
    """
    Print one two-column row: the url, then its description.

    Both values are converted with str() and left-justified (never
    truncated) to 80 and 30 characters respectively, separated by a
    single space.

    Args:
        url: Value shown in the first column.
        description: Value shown in the second column.
    """
    url_column = str(url).ljust(80)
    description_column = str(description).ljust(30)
    print(' '.join((url_column, description_column)))