@tanu360 · Created May 22, 2023
Extract Data from Flipkart Search.
# Tanu360 - Flipkart Mobiles Scraping Under 50000
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
print("👋 Hello from Alex")
pages_to_scrape = int(input("Total Pages to Scrape : "))

for i in range(1, pages_to_scrape + 1):
    # Per-page lists, so each page is appended to the CSV exactly once.
    Product_name = []
    Price = []
    Description = []
    Reviews = []
    Reviews_Count = []

    url = ("https://flipkart.com/search?q=mobile+under+50000&otracker=search"
           "&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page=" + str(i))
    # Flipkart may block the default requests User-Agent; a browser-like one tends to help.
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")

    # Results container. These obfuscated class names change whenever Flipkart
    # ships a new front-end build, so update them if nothing is found.
    box = soup.find('div', class_="_1YokD2 _3Mn1Gg")
    if box is None:
        print(f"==> Page {i}: no results container found, skipping.")
        continue

    # Extract product names.
    names = box.find_all('div', class_='_4rR01T')
    for name in names:
        Product_name.append(name.text if name else "")

    # Extract prices.
    prices = box.find_all('div', class_='_30jeq3 _1_WHN1')
    for price in prices:
        Price.append(price.text.strip() if price else "")

    # Extract descriptions (spec bullet points joined with " | ").
    desc = box.find_all('ul', class_='_1xgFaf')
    for d in desc:
        Description.append(' | '.join(d.strings) if d else "")

    # Extract review scores.
    reviews = box.find_all('div', class_='_3LWZlK')
    for review in reviews:
        Reviews.append(' '.join(review.text.split()) if review.text else "")

    # Extract review counts from text like "1,234 Ratings & 567 Reviews".
    reviews_count = box.find_all('span', class_='_2_R_DZ')
    for review_count in reviews_count:
        match = re.search(r'([\d,]+)\s+Reviews?', review_count.text or "", re.IGNORECASE)
        Reviews_Count.append(match.group(1).replace(',', '') if match else "")

    # Trim all columns to the same length before building the DataFrame.
    min_len = min(len(Product_name), len(Price), len(Description), len(Reviews), len(Reviews_Count))

    df = pd.DataFrame({
        "Product Name": Product_name[:min_len],
        "Price": Price[:min_len],
        "Description": Description[:min_len],
        "Reviews Score": Reviews[:min_len],
        "Reviews Count": Reviews_Count[:min_len]
    })

    # Append each page to the CSV; write the header only for the first page.
    df.to_csv("flipkart_data.csv", sep=",", encoding="utf-8-sig",
              mode='a', index=False, header=(i == 1))

    # Print the DataFrame.
    print(f"\n==> Page {i} Data :\n")
    print("--------------------")
    print(df)
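
Not part of the original gist: a minimal sketch of how the resulting flipkart_data.csv might be loaded back and the Price column cleaned into numbers for filtering or sorting. It assumes the scraped prices look like "₹49,999"; the "Price (INR)" column name is just an illustration.

# A minimal post-processing sketch (assumes prices look like "₹49,999").
import pandas as pd

df = pd.read_csv("flipkart_data.csv")

# Strip the currency symbol and thousands separators, then convert to numbers;
# errors="coerce" leaves NaN for anything that does not parse.
df["Price (INR)"] = pd.to_numeric(
    df["Price"].astype(str).str.replace(r"[^\d]", "", regex=True),
    errors="coerce",
)

# Example: the five cheapest phones found in the scrape.
print(df.sort_values("Price (INR)").head(5)[["Product Name", "Price (INR)"]])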