From a6c87c837fa89688b18533f1e027c91dba40ef13 Mon Sep 17 00:00:00 2001
From: eugenrmain
Date: Tue, 25 Mar 2025 18:09:57 +0100
Subject: [PATCH 1/2] Improved contact scraping by removing the reliance on BeautifulSoup and instead using Selenium to click into each listing, extracting the contact information directly from its profile panel. Also unpinned the dependency versions in requirements.txt.

---
 google_maps_scraper.py | 175 +++++++++++++++++++++--------------------
 requirements.txt       |   6 +-
 2 files changed, 94 insertions(+), 87 deletions(-)

diff --git a/google_maps_scraper.py b/google_maps_scraper.py
index 2d64ba0..62d3421 100644
--- a/google_maps_scraper.py
+++ b/google_maps_scraper.py
@@ -1,6 +1,5 @@
 import subprocess
 import pandas as pd
-from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.keys import Keys
@@ -13,8 +12,8 @@
 
 # Setup and initial configurations
 URL = "https://www.google.com/maps"
-service = "ENTER A SERVICE OR A NAME" # e.g. catering, events, etc. OR starbucks, mcdonalds, etc.
-location = "ENTER LOCATION" # e.g. London, Germany, etc.
+service = "Salon" # e.g. catering, events, etc.
+location = "Stockholm" # e.g. London, Germany, etc.
 
 print("Starting the web scraping script...")
 
@@ -27,112 +26,120 @@
 # Accept cookies
 try:
     print("Looking for accept cookies button...")
-    accept_cookies = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="yDmH0d"]/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/div[1]/form[2]/div/div/button')))
+    accept_cookies = WebDriverWait(driver, 10).until(
+        EC.element_to_be_clickable((By.XPATH, '//*[@id="yDmH0d"]/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/div[1]/form[2]/div/div/button'))
+    )
     accept_cookies.click()
     print("Accepted cookies.")
-except NoSuchElementException:
-    print("No accept cookies button found.")
+except Exception:
+    print("No accept cookies button found or already accepted.")
 
 # Search for results
 print(f"Searching for: {service} in {location}")
-input_field = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="searchboxinput"]')))
-input_field.send_keys(service.lower() + ' ' + location.lower())
+input_field = WebDriverWait(driver, 10).until(
+    EC.element_to_be_clickable((By.XPATH, '//*[@id="searchboxinput"]'))
+)
+input_field.send_keys(f"{service} {location}")
 input_field.send_keys(Keys.ENTER)
 print("Search submitted.")
 
-# Wait for the sidebar to load
-print("Waiting for the sidebar to load...")
-divSideBar = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, f"div[aria-label*='{service.lower()} {location.lower()}']")))
+# Wait for results to load
+time.sleep(5)
 
-# Scroll through the results
-print("Scrolling the sidebar to load all of the results...")
-previous_scroll_height = driver.execute_script("return arguments[0].scrollHeight", divSideBar)
+# Scroll and collect clickable result cards
+print("Scrolling to load all business listings...")
+scrollable_div_xpath = '//div[@role="feed"]'
+scrollable_div = WebDriverWait(driver, 10).until(
+    EC.presence_of_element_located((By.XPATH, scrollable_div_xpath))
+)
+
+last_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_div)
 while True:
-    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", divSideBar)
-    time.sleep(3)
-    new_scroll_height = driver.execute_script("return arguments[0].scrollHeight", divSideBar)
-    if new_scroll_height == previous_scroll_height:
+    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scrollable_div)
+    time.sleep(2)
+    new_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_div)
+    if new_height == last_height:
         break
-    previous_scroll_height = new_scroll_height
+    last_height = new_height
 
-print("Finished scrolling.")
-
-# Parse the page source
-print("Parsing the page source...")
-page_source = driver.page_source
-driver.quit()
+print("Finished scrolling. Collecting business profiles...")
+business_cards = driver.find_elements(By.CLASS_NAME, "Nv2PK")
 
-soup = BeautifulSoup(page_source, "html.parser")
-boxes = soup.find_all('div', class_='Nv2PK')
-
-# Collect data
-print("Collecting data...")
 data = []
-for box in boxes:
-    # Business name
-    try:
-        business_name = box.find('div', class_='qBF1Pd').getText()
-    except AttributeError:
-        business_name = "N/A"
-
-    # Address
-    try:
-        inner_div = box.find_all('div', class_='W4Efsd')[1].find('div', class_='W4Efsd')
-        address = [span.text for span in inner_div.find_all('span') if span.text and not span.find('span')][-1]
-    except (IndexError, AttributeError):
-        address = "N/A"
-
-    # Stars
-    try:
-        stars = box.find('span', class_='MW4etd').getText()
-    except AttributeError:
-        stars = "N/A"
-
-    # Number of reviews
-    try:
-        number_of_reviews = box.find('span', class_='UY7F9').getText().strip('()')
-    except AttributeError:
-        number_of_reviews = "N/A"
-
-    # Phone number
-    try:
-        phone_number = box.find('span', class_='UsdlK').getText()
-    except AttributeError:
-        phone_number = "N/A"
-
-    # Website
-    try:
-        website = box.find('a', class_='lcr4fd').get('href')
-    except AttributeError:
-        website = "N/A"
-
-    # Append to data list
-    data.append({
-        'Business Name': business_name,
-        'Address': address,
-        'Stars': stars,
-        'Number of Reviews': number_of_reviews,
-        'Phone Number': phone_number,
-        'Website': website,
-        'Email': ' ',
-    })
-
-# Create a DataFrame and save to Excel
+for i, card in enumerate(business_cards):
+    try:
+        print(f"Processing business {i+1}/{len(business_cards)}")
+        driver.execute_script("arguments[0].scrollIntoView();", card)
+        time.sleep(1)
+        card.click()
+        time.sleep(3)  # Let the profile panel load
+
+        # Collect business data from the profile panel
+        try:
+            name = driver.find_element(By.CLASS_NAME, "DUwDvf").text
+        except Exception:
+            name = "N/A"
+
+        try:
+            address = driver.find_element(By.XPATH, "//button[contains(@data-item-id, 'address')]//div[2]/div[1]").text
+        except Exception:
+            address = "N/A"
+
+        try:
+            phone = driver.find_element(By.XPATH, "//button[contains(@data-item-id, 'phone')]//div[2]/div[1]").text
+        except Exception:
+            phone = "N/A"
+
+        try:
+            website = driver.find_element(By.XPATH, "//a[contains(@data-item-id, 'authority')]").get_attribute("href")
+        except Exception:
+            website = "N/A"
+
+        try:
+            stars = driver.find_element(By.CLASS_NAME, "F7nice").text
+        except Exception:
+            stars = "N/A"
+
+        try:
+            reviews = driver.find_element(By.CLASS_NAME, "UY7F9").text.strip("()")
+        except Exception:
+            reviews = "N/A"
+
+        data.append({
+            'Business Name': name,
+            'Address': address,
+            'Stars': stars,
+            'Number of Reviews': reviews,
+            'Phone Number': phone,
+            'Website': website,
+            'Email': ' ',
+        })
+
+        # Back to the results panel
+        time.sleep(2)
+        driver.find_element(By.CLASS_NAME, "RVQdVd").click()
+        time.sleep(2)
+
+    except Exception as e:
+        print(f"Error processing card {i+1}: {e}")
+        continue
+
+# All listings processed; close the browser before saving
+driver.quit()
+
+# Save to Excel
 excel_file = f'{location}_{service}.xlsx'
 df = pd.DataFrame(data)
 df.to_excel(excel_file, index=False)
 print(f"Data has been saved to {excel_file}")
to {excel_file}") -# Create a configuration file -config = { - 'excel_file': excel_file -} +# Save config with open('config.json', 'w') as config_file: - json.dump(config, config_file) + json.dump({ 'excel_file': excel_file }, config_file) print("Configuration file created: config.json") -# Call the email extraction script +# Run the email extraction script print("Calling the email extraction script...") subprocess.run(['python', 'email_extraction_script.py']) print("Email extraction script completed.") diff --git a/requirements.txt b/requirements.txt index 5acf5d6..021b6cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -selenium==4.16.1 -pandas==2.0.2 -beautifulsoup4==4.12.2 +selenium +pandas +beautifulsoup4 From cf50fc326cecd468cbb280535aa02fa5b245eabf Mon Sep 17 00:00:00 2001 From: eugenrmain Date: Tue, 25 Mar 2025 18:11:58 +0100 Subject: [PATCH 2/2] Improved the scraping of contacts by removing the reliance of BeatifulSoup and using Selenium instead for actually clicking the services, thus excerting the contact information properly. Also added the latest imports in requirements.txt --- google_maps_scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/google_maps_scraper.py b/google_maps_scraper.py index 62d3421..c13739b 100644 --- a/google_maps_scraper.py +++ b/google_maps_scraper.py @@ -12,8 +12,8 @@ # Setup and initial configurations URL = "https://www.google.com/maps" -service = "Salon" # e.g. catering, events, etc. -location = "Stockholm" # e.g. London, Germany, etc. +service = "SERVICE" # e.g. catering, events, etc. +location = "LOCATION" # e.g. London, Germany, etc. print("Starting the web scraping script...")