This article shows how I scraped reviews from Google Play, Yelp, and Trustpilot using Python. Each platform required a different method:
- Trustpilot company pages: requests + BeautifulSoup with multiple selectors for changing HTML.
- Yelp businesses: Selenium with an anti-detection setup to bypass strong bot protection.
- Google Play Store apps: Selenium with a similar anti-detection setup, using explicit waits and scrolling for structured results.
You'll learn how to handle anti-detection, parse customer review data, and save the results to CSV or JSON (or push them into a tool like Google Sheets) for analysis.
How to Scrape Google Play Reviews with Python & Selenium
Step 1: Setting up anti-detection
Scraping Google Play reviews requires hiding automation signals. The script configures Chrome with disabled automation flags, a custom user agent, and a fixed window size.
Start with headless=False to monitor the browser, then switch to True once stable.
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
class PlayStoreReviewScraper:
def __init__(self, headless=False):
chrome_options = Options()
if headless:
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
chrome_options.add_argument("--window-size=1920,1080")
self.driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()),
options=chrome_options
)
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
self.wait = WebDriverWait(self.driver, 15)
Step 2: Searching for apps
This function searches the Play Store for apps matching your search term and extracts app names and URLs.
def search_apps(self, search_term, max_apps=3):
search_url = f"https://play.google.com/store/search?q={search_term.replace(' ', '+')}&c=apps"
print(f"Searching Play Store: {search_term}")
print(f"URL: {search_url}\n")
self.driver.get(search_url)
time.sleep(5)
apps = []
try:
app_elements = self.driver.find_elements(By.CSS_SELECTOR, 'a[href*="/store/apps/details?id="]')
print(f"Found {len(app_elements)} app links\n")
for element in app_elements[:max_apps * 3]:
try:
href = element.get_attribute('href')
if not href or 'details?id=' not in href:
continue
app_id = href.split('id=')[1].split('&')[0]
try:
parent = element.find_element(By.XPATH, './parent::*')
name_elem = parent.find_element(By.TAG_NAME, 'span')
app_name = name_elem.text.strip()
except:
try:
name_elem = element.find_element(By.XPATH, './/span')  # './/' keeps the search relative to this element; '//span' would scan the whole page
app_name = name_elem.text.strip()
except:
app_name = app_id
if app_name and len(app_name) > 2 and app_name != app_id:
apps.append({
'name': app_name,
'app_id': app_id,
'url': f"https://play.google.com/store/apps/details?id={app_id}"
})
print(f" Found #{len(apps)}: {app_name}")
if len(apps) >= max_apps:
break
except:
continue
except Exception as e:
print(f"Error searching apps: {e}")
print(f"\nTotal {len(apps)} apps found!\n")
return apps
The function builds a search URL with &c=apps so results are limited to apps. We look for links containing /store/apps/details?id=, which is Google Play's URL pattern for app pages. The app ID comes from the URL, and the app name is pulled from a span inside the link's parent element.
If no readable name can be extracted, the entry is skipped rather than saved under its raw ID. The max_apps parameter controls the number of apps to scrape (default: 3, but adjustable to 5, 10, or more), and search_term defines the category, e.g., "thrift shopping," "fitness tracking," or "photo editing."
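As a quick illustration (results will vary by region and over time), the search step can be run on its own:
# Illustrative standalone run of the search step; app names and ranking change constantly
scraper = PlayStoreReviewScraper(headless=False)
apps = scraper.search_apps("fitness tracking", max_apps=5)
for app in apps:
    print(f"{app['name']} -> {app['url']}")
scraper.close()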
Step 3: Extracting reviews
This function navigates to an app page, clicks “See all reviews”, scrolls to load more reviews, and extracts customer review data.
def scrape_reviews(self, app_url, max_reviews=20):
print(f"Scraping reviews: {app_url}")
self.driver.get(app_url)
time.sleep(4)
reviews = []
try:
see_all_button = self.wait.until(
EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), 'See all reviews') or contains(text(), 'all reviews')]"))
)
see_all_button.click()
print(" Clicked 'See all reviews'")
time.sleep(5)
self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-review-id]')))
print(" Reviews loaded")
except:
print(" Could not find 'See all reviews' button, trying to scroll...")
last_review_count = 0
scroll_attempts = 0
max_scroll_attempts = 5
while len(reviews) < max_reviews and scroll_attempts < max_scroll_attempts:
try:
# Scroll on each pass so more reviews load (if reviews open in a dialog, you may need to scroll that container instead)
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
review_elements = self.driver.find_elements(By.CSS_SELECTOR, '.RHo1pe')
if not review_elements:
review_elements = self.driver.find_elements(By.CSS_SELECTOR, 'div[data-review-id]')
if len(review_elements) == last_review_count:
scroll_attempts += 1
else:
scroll_attempts = 0
last_review_count = len(review_elements)
for review_elem in review_elements:
if len(reviews) >= max_reviews:
print(f" Reached {max_reviews} reviews, stopping...")
break
review_id = review_elem.get_attribute('data-review-id') or str(len(reviews))
if any(r['review_id'] == review_id for r in reviews):
continue
try:
expand_button = review_elem.find_element(By.XPATH, ".//button[contains(@aria-label, 'See more')]")
self.driver.execute_script("arguments[0].click();", expand_button)
time.sleep(0.5)
except:
pass
user_name = 'Anonymous'
try:
user_elem = review_elem.find_element(By.CSS_SELECTOR, '.X5PpBb')
user_name = user_elem.text.strip()
except:
pass
rating = 'N/A'
try:
rating_elem = review_elem.find_element(By.CSS_SELECTOR, '.iXRFPc')
rating_text = rating_elem.get_attribute('aria-label')
if rating_text and 'Rated' in rating_text:
rating = rating_text.split()[1]
except:
pass
date = 'N/A'
try:
date_elem = review_elem.find_element(By.CSS_SELECTOR, '.bp9Aid')
date = date_elem.text.strip()
except:
pass
review_text = ""
try:
text_elem = review_elem.find_element(By.CSS_SELECTOR, '.h3YV2d')
review_text = text_elem.text.strip()
except:
pass
if review_text and len(review_text) > 10:
reviews.append({
'review_id': review_id,
'user_name': user_name,
'rating': rating,
'date': date,
'text': review_text
})
print(f" Review #{len(reviews)}: {user_name} - {rating} stars")
except Exception as e:
print(f" Scroll error: {e}")
break
print(f" Total {len(reviews)} reviews scraped\n")
return reviews
The function clicks “See all reviews” if available, then waits for review elements. If the button is missing, it falls back to scrolling. Reviews are collected until the limit is reached or five scrolls return no new data.
The scraper extracts review ID, username, rating, date, and full review text, expanding truncated reviews when possible.
CSS selectors:
- .RHo1pe → review container
- .X5PpBb → username
- .iXRFPc → rating (aria-label)
- .bp9Aid → date
- .h3YV2d → review text
The max_reviews parameter controls the number of reviews (default 20, adjustable to 50, 100, or more).
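For a single app, the review step can also be called directly; the app ID below is only a placeholder, so substitute any ID returned by the search step:
# Hypothetical single-app example; replace the placeholder ID with a real one from search_apps()
scraper = PlayStoreReviewScraper(headless=False)
app_url = "https://play.google.com/store/apps/details?id=com.example.app"  # placeholder app ID
reviews = scraper.scrape_reviews(app_url, max_reviews=50)
print(f"Collected {len(reviews)} reviews")
scraper.close()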
Step 4: Putting it all together
These functions combine the app search and customer review scraping workflow, then save results to CSV.
def scrape_multiple_apps(self, search_term, max_apps=3, reviews_per_app=10):
apps = self.search_apps(search_term, max_apps)
if not apps:
print("No apps found!")
return []
all_results = []
for i, app in enumerate(apps, 1):
print(f"\n[{i}/{len(apps)}] Processing {app['name']}...")
print("-" * 60)
try:
reviews = self.scrape_reviews(app['url'], max_reviews=reviews_per_app)
for review in reviews:
review['app_name'] = app['name']
review['app_id'] = app['app_id']
review['app_url'] = app['url']
all_results.append(review)
time.sleep(3)
except Exception as e:
print(f" Error: {str(e)}")
continue
return all_results
def save_to_csv(self, reviews, filename="playstore_reviews.csv"):
if not reviews:
print("No reviews to save!")
return
fieldnames = ['app_name', 'app_id', 'app_url', 'user_name', 'rating', 'date', 'text', 'review_id']
with open(filename, 'w', newline="", encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(reviews)
print(f"\n{'='*60}")
print(f"SUCCESS: {len(reviews)} reviews saved to '{filename}'")
print(f"{'='*60}")
def close(self):
self.driver.quit()
Example run
The workflow searches for apps, scrapes reviews, and saves them to a CSV. Each review is tagged with the app name, ID, and URL. A three-second delay between apps prevents rate limiting.
With the default settings (3 apps × 10 reviews), the script collects about 30 reviews in 3–4 minutes. The CSV includes app name, ID, URL, username, rating, date, review text, and review ID.
if __name__ == "__main__":
scraper = PlayStoreReviewScraper(headless=False)
try:
print("=" * 60)
print("GOOGLE PLAY STORE REVIEW SCRAPER")
print("=" * 60)
reviews = scraper.scrape_multiple_apps(
search_term="thrift shopping",
max_apps=3,
reviews_per_app=10
)
print(f"\n{'='*60}")
print(f"SUMMARY: {len(reviews)} total reviews scraped!")
print(f"{'='*60}\n")
for i, review in enumerate(reviews[:3], 1):
print(f"Review #{i}:")
print(f"App: {review['app_name']}")
print(f"User: {review['user_name']}")
print(f"Rating: {review['rating']}")
print(f"Review: {review['text'][:80]}...")
print("-" * 60)
if reviews:
scraper.save_to_csv(reviews, "playstore_secondhand_reviews.csv")
finally:
scraper.close()
How to Scrape Yelp Reviews with Python (No API)
Step 1: Setting up anti-detection
When we first attempted to scrape Yelp using basic Selenium, we immediately encountered a CAPTCHA. Yelp detects automation signals, so basic setups fail.
Through testing, we found Yelp checks specific browser properties. For example, the navigator.webdriver property in JavaScript returns true when Selenium is active. Chrome’s automation flags and the user-agent string can also reveal automation.
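You can confirm the main signal yourself with a bare-bones session (a minimal sketch, assuming Selenium 4+, which resolves the ChromeDriver automatically):
# Minimal check: an unmodified Selenium session usually exposes navigator.webdriver as true
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.yelp.com")
print(driver.execute_script("return navigator.webdriver"))  # typically prints True
driver.quit()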
Here’s the setup that worked:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
class YelpReviewScraper:
def __init__(self, headless=False):
chrome_options = Options()
if headless:
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36")
chrome_options.add_argument("--window-size=1920,1080")
self.driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()),
options=chrome_options
)
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
self.wait = WebDriverWait(self.driver, 15)
The key step is redefining the navigator.webdriver property: overriding it to return undefined hides one of the main signals Yelp checks. Combined with a custom user-agent string and the disabled automation flags, this makes the browser look much more like a regular user.
Start with headless=False to monitor the run. Once stable, switch to True for faster background scraping.
Step 2: Searching for businesses
Yelp’s search results load progressively, and the HTML uses dynamic class names that change often. This breaks selectors based on exact classes.
Our first attempts captured irrelevant links like “See more reviews” and “View menu”. Filtering was added to target only real business links.
def search_businesses(self, search_term, location, max_businesses=10):
search_url = f"https://www.yelp.com/search?find_desc={search_term.replace(' ', '+')}&find_loc={location.replace(' ', '+')}"
print(f"Searching: {search_term} in {location}")
print(f"URL: {search_url}\n")
self.driver.get(search_url)
time.sleep(5)
try:
title = self.driver.title
if "captcha" in title.lower():
print("CAPTCHA detected!")
input("Solve CAPTCHA and press ENTER...")
except:
pass
businesses = []
scroll_count = 0
max_scrolls = 5
while len(businesses) < max_businesses and scroll_count < max_scrolls:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)
business_elements = self.driver.find_elements(By.CSS_SELECTOR, 'a[href*="/biz/"]')
print(f"Scroll #{scroll_count + 1}: Checking {len(business_elements)} links...")
for element in business_elements:
if len(businesses) >= max_businesses:
break
try:
href = element.get_attribute('href')
text = element.text.strip()
if not href or '/biz/' not in href:
continue
if any(skip in text.lower() for skip in ['see more', 'read more', 'view', 'show', 'all reviews', 'photo', 'menu']):
continue
clean_url = href.split('?')[0].split('#')[0]
if clean_url in [b['url'] for b in businesses]:
continue
name = text if len(text) >= 3 else clean_url.split('/biz/')[-1].replace('-', ' ').title()
businesses.append({
'name': name,
'url': clean_url
})
print(f" Found #{len(businesses)}: {name}")
except:
continue
scroll_count += 1
print(f"\nTotal {len(businesses)} businesses found!\n")
return businesses
We scroll multiple times since Yelp loads results progressively. The scraper collects links containing /biz/, Yelp's URL pattern for business pages. Filtering removes navigation links, duplicates are skipped, and link texts shorter than three characters fall back to a name derived from the URL slug.
The max_businesses parameter controls how many results are scraped. Start with 3 for testing, then increase once stable.
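Run on its own, the search step looks like this (a sketch; listings and ordering change constantly):
# Illustrative standalone run of the business search
scraper = YelpReviewScraper(headless=False)
businesses = scraper.search_businesses("coffee", "San Francisco", max_businesses=3)
for biz in businesses:
    print(f"{biz['name']} -> {biz['url']}")
scraper.close()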
Step 3: Extracting reviews
Yelp’s biggest challenge is unstable HTML. Standard CSS selectors often failed or returned the wrong elements.
By inspecting with Chrome DevTools, we found reviews inside paragraph tags with class names containing “comment”. Within them, span tags containing “raw” hold the actual review text. This pattern stays consistent even when class names change slightly.
def scrape_reviews(self, business_url, max_reviews=20):
print(f"Scraping reviews: {business_url}")
self.driver.get(business_url)
time.sleep(4)
reviews = []
for i in range(3):
self.driver.execute_script(f"window.scrollTo(0, {1000 * (i+1)});")
time.sleep(1.5)
try:
comment_elements = self.driver.find_elements(By.CSS_SELECTOR, 'p[class*="comment"]')
print(f" Found {len(comment_elements)} review texts")
for comment_elem in comment_elements[:max_reviews]:
try:
text_span = comment_elem.find_element(By.CSS_SELECTOR, 'span[class*="raw"]')
review_text = text_span.text.strip()
if not review_text or len(review_text) < 20:
continue
user_name = "Anonymous"
try:
review_container = comment_elem.find_element(By.XPATH, './ancestor::li[1]')
user_links = review_container.find_elements(By.CSS_SELECTOR, 'a[href*="/user_details"]')
if user_links:
user_name = user_links[0].text.strip()
except:
pass
rating = "N/A"
try:
rating_imgs = review_container.find_elements(By.CSS_SELECTOR, 'img[alt*="star"]')
if rating_imgs:
rating = rating_imgs[0].get_attribute('alt').split()[0]
except:
pass
date = "N/A"
try:
date_spans = review_container.find_elements(By.TAG_NAME, 'span')
for span in date_spans:
span_text = span.text.strip()
if '/' in span_text or 'ago' in span_text.lower() or 'day' in span_text.lower():
date = span_text
break
except:
pass
reviews.append({
'user_name': user_name,
'rating': rating,
'date': date,
'text': review_text
})
print(f" Review #{len(reviews)}: {user_name} - {rating} stars")
except:
continue
except Exception as e:
print(f" Error scraping reviews: {e}")
print(f" Total {len(reviews)} reviews scraped\n")
return reviews
This approach relies on partial attribute matching with [class*="comment"] and [class*="raw"], which keeps the scraper working even when Yelp rotates its exact class names. The script extracts the review text, user name, rating, and date.
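To see why partial matching helps, here is a small BeautifulSoup toy example; the hashed class names are invented to mimic Yelp's markup:
# Toy example: substring selectors still match when the hashed suffixes change
from bs4 import BeautifulSoup

html = '''
<li>
  <p class="comment__09f24__D0cxf">
    <span class="raw__09f24__T4Ezm">Great espresso and friendly staff.</span>
  </p>
</li>
'''
soup = BeautifulSoup(html, 'html.parser')
for p in soup.select('p[class*="comment"]'):
    print(p.select_one('span[class*="raw"]').get_text(strip=True))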
Step 4: Putting it all together
Now we combine everything into a workflow that searches for businesses, scrapes their reviews, and saves the results to CSV.
def scrape_multiple_businesses(self, search_term, location, max_businesses=3, reviews_per_business=10):
businesses = self.search_businesses(search_term, location, max_businesses)
if not businesses:
print("No businesses found!")
return []
all_results = []
for i, business in enumerate(businesses, 1):
print(f"\n[{i}/{len(businesses)}] Processing {business['name']}...")
print("-" * 60)
try:
reviews = self.scrape_reviews(business['url'], max_reviews=reviews_per_business)
for review in reviews:
review['business_name'] = business['name']
review['business_url'] = business['url']
all_results.append(review)
time.sleep(2)
except Exception as e:
print(f" Error: {str(e)}")
continue
return all_results
def save_to_csv(self, reviews, filename="yelp_reviews.csv"):
if not reviews:
print("No reviews to save!")
return
fieldnames = ['business_name', 'business_url', 'user_name', 'rating', 'date', 'text']
with open(filename, 'w', newline="", encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(reviews)
print(f"\n{'='*60}")
print(f"SUCCESS: {len(reviews)} reviews saved to '{filename}'")
print(f"{'='*60}")
def close(self):
self.driver.quit()
Example run
A two-second delay between businesses helps avoid rate limiting. In tests, two seconds was reliable, but you can reduce it to one second for small runs or increase it to five seconds for large-scale scraping.
Each review is tagged with the business name and URL before being saved, allowing you to trace the source.
With the default settings of 3 businesses and 10 reviews each, the script collects ~30 reviews in 2–3 minutes. Once stable, you can scale up:
- 10 businesses × 20 reviews each: ~200 reviews in ~10 minutes
- 20 businesses × 50 reviews each: ~1000 reviews in ~15–20 minutes
The CSV file includes columns for business name, URL, username, rating, date, and review text. It can be opened in Excel or imported into pandas for analyzing customer feedback (see the short sketch after the example run below).
if __name__ == "__main__":
scraper = YelpReviewScraper(headless=False)
try:
print("=" * 60)
print("YELP REVIEW SCRAPER")
print("=" * 60)
reviews = scraper.scrape_multiple_businesses(
search_term="coffee",
location="San Francisco",
max_businesses=3,
reviews_per_business=10
)
print(f"\n{'='*60}")
print(f"SUMMARY: {len(reviews)} total reviews scraped!")
print(f"{'='*60}\n")
for i, review in enumerate(reviews[:3], 1):
print(f"Review #{i}:")
print(f"Business: {review['business_name']}")
print(f"User: {review['user_name']}")
print(f"Rating: {review['rating']}")
print(f"Review: {review['text'][:80]}...")
print("-" * 60)
if reviews:
scraper.save_to_csv(reviews, "yelp_coffee_reviews.csv")
finally:
scraper.close()
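As mentioned above, a quick pandas sketch (assuming pandas is installed; the column names match the save_to_csv fields) can summarize the exported file:
# Sketch: review count and average rating per business from the scraped CSV
import pandas as pd

df = pd.read_csv("yelp_coffee_reviews.csv")
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")  # "N/A" becomes NaN
print(df.groupby("business_name")["rating"].agg(["count", "mean"]).round(2))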
How to Scrape Trustpilot Reviews with Python
Step 1: Setting up and searching for companies
Required libraries
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import quote
We import the necessary libraries:
- requests: handles the HTTP requests
- BeautifulSoup (from bs4): parses the HTML we receive
- json: serializes the results in a structured format
- time: adds delays to avoid overwhelming the server
- quote (from urllib.parse): URL-encodes search terms
The search function
def search_travel_agencies(search_term="travel agency", location=None):
encoded_query = quote(search_term)
if location:
encoded_location = quote(location)
url = f"https://www.trustpilot.com/search?query={encoded_query}&location={encoded_location}"
else:
url = f"https://www.trustpilot.com/search?query={encoded_query}"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
The search term is URL-encoded with quote(). If a location is provided, it’s added as a parameter. Custom headers mimic a real browser to reduce blocking, with a User-Agent string that identifies us as Chrome on Windows.
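For example, quote() percent-encodes characters that are not URL-safe:
from urllib.parse import quote

print(quote("travel agency"))   # travel%20agency
print(quote("b&b hotels"))      # b%26b%20hotels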
Making the request and parsing results
try:
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
companies = []
selectors = [
('a[href*="/review/"]', 'href'),
('.link_internal__7XN06', 'href'),
('.styles_businessUnitLink__AjhzZ', 'href'),
('div[class*="businessUnit"] a', 'href'),
('article a[href*="/review/"]', 'href')
]
for selector, attr in selectors:
links = soup.select(selector)
if links:
for link in links[:5]:
href = link.get(attr, "")
if '/review/' in href and href not in [c.get('url', "") for c in companies]:
company_slug = href.split('/review/')[-1].strip('/').split('?')[0]
if company_slug:
full_name = company_slug.replace('-', ' ').replace('www.', '').title()
companies.append({
'slug': company_slug,
'name': full_name,
'url': f"/review/{company_slug}"
})
if len(companies) >= 3:
break
return companies[:3]
except Exception as e:
print(f"Search error: {e}")
return []
We send the request with headers and parse results using BeautifulSoup. Because Trustpilot often changes class names, multiple selectors are defined.
Each selector targets links with /review/, which mark company pages. From each link, we extract the slug (unique identifier in the URL), clean it into a readable name, and return the first three companies found.
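For instance, a result link pointing at /review/www.booking.com yields the slug www.booking.com, and the cleanup turns it into a readable name:
# Worked example of the slug-to-name cleanup used above
company_slug = "www.booking.com"
full_name = company_slug.replace('-', ' ').replace('www.', '').title()
print(full_name)  # Booking.Com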
Step 2: Fetching review data from company pages
The review-fetching function
def fetch_reviews(company_slug, max_reviews=10):
reviews = []
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5'
}
page = 1
while len(reviews) < max_reviews:
url = f"https://www.trustpilot.com/review/{company_slug}?page={page}"
try:
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
The function loops through pages until the desired number of reviews is collected. Each page is requested with headers to avoid detection, and pagination is handled by incrementing the page parameter.
Extracting review cards
review_selectors = [
'article[data-service-review-card-paper]',
'div[data-service-review-card-paper]',
'article.paper_paper__lyudo',
'div.styles_reviewCard__hcAvI',
'article'
]
review_cards = []
for selector in review_selectors:
review_cards = soup.select(selector)
if review_cards:
break
if not review_cards:
break
We try multiple selectors because Trustpilot frequently changes its design. Each selector targets possible review card structures. If none match, we stop scraping.
Parsing individual reviews
for card in review_cards:
if len(reviews) >= max_reviews:
break
review = {}
# Extract rating
star_elem = card.find('div', {'class': lambda x: x and 'star' in x.lower()})
if not star_elem:
star_elem = card.find('img', {'alt': lambda x: x and 'star' in x.lower()})
if star_elem and star_elem.name == 'img':
review['stars'] = star_elem.get('alt', 'N/A')
elif star_elem and star_elem.find('img'):  # guard against a container with no nested <img>
review['stars'] = star_elem.find('img').get('alt', 'N/A')
else:
review['stars'] = 'N/A'
# Extract title
title = card.find('h2') or card.find('h3')
review['title'] = title.get_text(strip=True) if title else 'N/A'
# Extract review text
text = card.find('p', {'data-service-review-text-typography': True})
if not text:
text = card.find('p', {'class': lambda x: x and 'body' in str(x).lower()})
review['text'] = text.get_text(strip=True) if text else 'N/A'
# Extract date
date = card.find('time')
review['date'] = date.get('datetime', date.get_text(strip=True)) if date else 'N/A'
# Extract user
user = card.find('span', {'data-consumer-name-typography': True})
if not user:
user = card.find('span', {'class': lambda x: x and 'consumer' in str(x).lower()})
review['user'] = user.get_text(strip=True) if user else 'N/A'
reviews.append(review)
page += 1
time.sleep(2)
except Exception as e:
break
return reviews[:max_reviews]
For each review, we extract the rating, title, review text, date, and username. Flexible selectors (with lambda) make the scraper resilient to HTML changes.
After processing each page, we add a 2-second delay using time.sleep(2). This is crucial for being respectful to Trustpilot’s servers and avoiding rate limiting or IP bans.
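If you want the pacing to look a little less mechanical, one optional variation (not used in the script above) is to randomize the pause:
import random
import time

# Optional tweak: sleep a random 2-4 seconds between pages instead of a fixed delay
time.sleep(random.uniform(2, 4))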
Step 3: Main program and output
Main function setup
def main():
print("="*60)
print("TRUSTPILOT REVIEW SCRAPER")
print("="*60)
search_term = "travel agency"
location = "United States"
max_reviews_per_company = 10
if location:
print(f"Location: {location}")
print(f"Search term: {search_term}")
print(f"Max reviews per company: {max_reviews_per_company}\n")
companies = search_travel_agencies(search_term, location)
if not companies:
print("\nNo companies found!")
print("Using fallback: Known travel agencies\n")
companies = [
{'slug': 'www.booking.com', 'name': 'Booking.com'},
{'slug': 'www.expedia.com', 'name': 'Expedia'},
{'slug': 'www.tripadvisor.com', 'name': 'TripAdvisor'}
]
print(f"Found {len(companies)} travel agencies\n")
print("-"*60)
This main function defines the search term, location, and review limit. The location can be set to any country (e.g., “Germany”) or None for global results. The fallback ensures functionality even if the search fails.
Collecting and saving data
all_data = {}
for i, company in enumerate(companies, 1):
print(f"\n[{i}/{len(companies)}] Fetching reviews for {company['name']}...")
reviews = fetch_reviews(company['slug'], max_reviews=max_reviews_per_company)
all_data[company['name']] = {
'url': f"trustpilot.com/review/{company['slug']}",
'review_count': len(reviews),
'reviews': reviews
}
print(f"✓ {len(reviews)} reviews fetched")
time.sleep(2)
filename = 'travel_agency_reviews.json'
with open(filename, 'w', encoding='utf-8') as f:
json.dump(all_data, f, ensure_ascii=False, indent=2)
Each company’s reviews are stored in a dictionary with metadata (URL, review count). A 2-second delay is added between companies to respect Trustpilot’s servers. Finally, the results are saved to a JSON file with UTF-8 encoding.
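Once saved, the JSON file can be reloaded for further analysis; for example, to print a per-company review count:
# Reload the saved results and summarize them
import json

with open("travel_agency_reviews.json", encoding="utf-8") as f:
    data = json.load(f)

for company, info in data.items():
    print(f"{company}: {info['review_count']} reviews")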
Displaying results
print("\n" + "="*60)
print("REVIEWS SUMMARY")
print("="*60)
total_reviews = 0
for company_name, data in all_data.items():
print(f"\n{'='*60}")
print(f"AGENCY: {company_name}")
print(f"Total Reviews: {data['review_count']}")
print("="*60)
total_reviews += data['review_count']
for idx, review in enumerate(data['reviews'], 1):
print(f"\nReview #{idx}")
print(f" User: {review.get('user', 'N/A')}")
print(f" Stars: {review.get('stars', 'N/A')}")
print(f" Title: {review.get('title', 'N/A')}")
print(f" Review: {review.get('text', 'N/A')}")
print("-"*40)
print("\n" + "="*60)
print(f"Total {total_reviews} reviews scraped")
print(f"Data saved to '{filename}'")
print("="*60 + "\n")
if __name__ == '__main__':
main()
The script prints a clean summary of all reviews. Each review displays the user, rating, title, and text. The .get() method ensures missing fields default to 'N/A'. Finally, the script confirms the total reviews scraped and the JSON filename.
Final thoughts
Scraping reviews from Google Play, Yelp, and Trustpilot required different Python approaches. Each scraper exported ~30 reviews per run in CSV/JSON with usernames, ratings, dates, and text.
The key differences: Google Play and Yelp both needed Selenium with an anti-detection setup to get past bot checks, while Trustpilot could be scraped with plain requests and BeautifulSoup, making it the lightest approach of the three.