Created
May 22, 2023 14:30
-
-
Save tanu360/c5dca914134045c09b6470a8c154fc17 to your computer and use it in GitHub Desktop.
Extract Data from Flipkart Search.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tanu360 - Flipkart Mobiles Scraping Under 50000
#
# Scrapes product name, price, description, review score and review count
# for the "mobile under 50000" search on Flipkart, one results page at a
# time, and appends each page's rows to flipkart_data.csv.
import codecs
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

print("👋 Hello from Alex")

pages_to_scrape = int(input(" Total Pages to Scrape : "))

# Compiled once, hoisted out of the loop.  The original pattern
# r'(\d+)\s+R|reviews?' was a broken alternation: it matched either
# "<digits> R" OR the bare word "reviews", and in the latter case
# group(1) was None (which then got appended to the results).  This
# matches strings like "1,234 Reviews" and captures the number.
REVIEWS_RE = re.compile(r'([\d,]+)\s+Reviews', re.IGNORECASE)

# NOTE(review): Flipkart often rejects the default python-requests
# User-Agent; a browser-like UA makes the scrape reliable.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

for i in range(1, pages_to_scrape + 1):
    # Reset the accumulators per page.  Previously these lists were created
    # once before the loop and only appended to, while the DataFrame was
    # built and written inside the loop with mode='a' -- so every page's
    # write re-wrote all earlier pages' rows, duplicating data in the CSV.
    Product_name = []
    Price = []
    Description = []
    Reviews = []
    Reviews_Count = []

    url = ("https://flipkart.com/search?q=mobile+under+50000&otracker=search"
           "&otracker1=search&marketplace=FLIPKART&as-show=off&as=off&page="
           + str(i))
    r = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(r.text, "lxml")

    box = soup.find('div', class_="_1YokD2 _3Mn1Gg")
    if box is None:
        # Request blocked, page layout changed, or no more results:
        # skip the page instead of crashing with AttributeError below.
        print(f"==> Page {i}: results container not found, skipping.")
        continue

    # Extract product names.
    for name in box.find_all('div', class_='_4rR01T'):
        Product_name.append(name.text)

    # Extract prices.
    for price in box.find_all('div', class_='_30jeq3 _1_WHN1'):
        Price.append(price.text.strip())

    # Extract descriptions (spec bullet list joined into one string).
    for d in box.find_all('ul', class_='_1xgFaf'):
        Description.append(' | '.join(d.strings))

    # Extract review scores (collapse internal whitespace).
    for review in box.find_all('div', class_='_3LWZlK'):
        Reviews.append(' '.join(review.text.split()))

    # Extract review counts, e.g. "4,096 Ratings & 310 Reviews" -> "310".
    for review_count in box.find_all('span', class_='_2_R_DZ'):
        match = REVIEWS_RE.search(review_count.text)
        # Strip thousands separators so the CSV holds a plain number;
        # empty string when no count is present on the card.
        Reviews_Count.append(match.group(1).replace(',', '') if match else "")

    # Columns can come back ragged (e.g. a card with no rating yet), so
    # truncate all columns to the shortest before building the frame.
    min_len = min(len(Product_name), len(Price), len(Description),
                  len(Reviews), len(Reviews_Count))

    # Create a DataFrame with this page's extracted data.
    df = pd.DataFrame({
        "Product Name": Product_name[:min_len],
        "Price": Price[:min_len],
        "Description": Description[:min_len],
        "Reviews Score": Reviews[:min_len],
        "Reviews Count": Reviews_Count[:min_len]
    })

    # Append this page to the CSV.  Write the header only for the first
    # page so the file does not contain repeated header rows between
    # pages (previously header=True was passed on every append).  If the
    # file already exists from an earlier run, its rows are preserved.
    df.to_csv("flipkart_data.csv", sep=",", encoding="utf-8-sig",
              mode='a', index=False, header=(i == 1))

    # Print the page's DataFrame.
    print(f"\n==> Page {i} Data :\n")
    print("--------------------")
    print(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment