Last active
September 14, 2022 10:01
-
-
Save dimitryzub/fa98a45e009a790758983e49ef70856d to your computer and use it in GitHub Desktop.
Scrape ResearchGate Search - All Questions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Blog post: https://serpapi.com/blog/web-scraping-all-questions-from-researchgate-search-in-python/ | |
| from parsel import Selector | |
| from playwright.sync_api import sync_playwright | |
| import json | |
_RESEARCHGATE_BASE_URL = "https://www.researchgate.net/"


def _absolute_url(href):
    """Resolve a scraped href against the ResearchGate origin; None if it is missing."""
    from urllib.parse import urljoin

    if not href:
        return None
    # The scraped hrefs are relative without a leading slash ("post/..."), so
    # plain string concatenation produced "researchgate.netpost/...".
    return urljoin(_RESEARCHGATE_BASE_URL, href)


def _parse_question(card):
    """Extract the fields of a single question result card into a dict."""
    title_css = ".nova-legacy-v-question-item__title .nova-legacy-e-link--theme-bare"
    views_css = (
        ".nova-legacy-v-question-item__metrics-item:nth-child(1) "
        ".nova-legacy-e-link--theme-bare"
    )
    answers_css = (
        ".nova-legacy-v-question-item__metrics-item+ "
        ".nova-legacy-v-question-item__metrics-item .nova-legacy-e-link--theme-bare"
    )

    # `.get()` returns None when nothing matches, so guard before calling
    # string methods on the title.
    raw_title = card.css(f"{title_css}::text").get()
    title = raw_title.title().strip() if raw_title is not None else None

    return {
        "title": title,
        "link": _absolute_url(card.css(f"{title_css}::attr(href)").get()),
        "snippet": card.css(".redraft-text").xpath("normalize-space()").get(),
        "question_type": card.css(".nova-legacy-v-question-item__badge::text").get(),
        "question_date": card.css(
            ".nova-legacy-v-question-item__meta-data-item:nth-child(1) span::text"
        ).get(),
        "views": {
            "views_count": card.css(f"{views_css}::text").get(),
            "views_link": _absolute_url(card.css(f"{views_css}::attr(href)").get()),
        },
        "answer": {
            "answer_count": card.css(f"{answers_css}::text").get(),
            "answers_link": _absolute_url(card.css(f"{answers_css}::attr(href)").get()),
        },
    }


def scrape_researchgate_questions(query: str):
    """Scrape every page of ResearchGate question search results for ``query``.

    Opens a headless Chromium browser through Playwright, walks the paginated
    search results, prints progress per page and finally prints all collected
    questions as indented JSON.

    Args:
        query: Plain-text search phrase; it is URL-encoded before being sent.

    Returns:
        A list of dicts, one per question, with the keys ``title``, ``link``,
        ``snippet``, ``question_type``, ``question_date``, ``views`` and
        ``answer``. Fields whose element is missing from the page are None.
    """
    from urllib.parse import urlencode

    questions = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, slow_mo=50)
        try:
            page = browser.new_page(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36")
            page_num = 1

            while True:
                # Encode the parameters so spaces, "&", "#", etc. in the query
                # cannot corrupt the URL.
                params = urlencode({"q": query, "page": page_num})
                page.goto(f"https://www.researchgate.net/search/question?{params}")
                selector = Selector(text=page.content())

                cards = selector.css(".nova-legacy-c-card__body--spacing-inherit")
                questions.extend(_parse_question(card) for card in cards)

                print(f"page number: {page_num}")

                # A page without any result cards means there is nothing left
                # to collect; stop instead of requesting pages forever.
                if not cards:
                    break

                # Per the original author's note: the next-page arrow carries a
                # `rel` attribute when it is greyed out (inactive), i.e. this
                # is the last page.
                if selector.css(".nova-legacy-c-button-group__item:nth-child(9) a::attr(rel)").get():
                    break

                page_num += 1
        finally:
            # Close the browser even if navigation or parsing raised.
            browser.close()

    print(json.dumps(questions, indent=2, ensure_ascii=False))
    return questions
if __name__ == "__main__":
    # Example run: scrape all question results for the search term "coffee".
    # Guarded so that importing this module does not start a browser session.
    scrape_researchgate_questions(query="coffee")
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Part of the output:
[ { "title": "Any Recommendations On An Inexpensive Coffee Grinder To Grind Up Bark Samples To Measure Ph?", "link": "https://www.researchgate.netpost/Any_recommendations_on_an_inexpensive_coffee_grinder_to_grind_up_bark_samples_to_measure_pH?_sg=tsmZvLsXrFpn6TG77ljxS8pVJhdOMYVlqqYhQl0BszqPCDW1__lnpczwZl8XJiVROJ8_8G8jaerzpX8", "snippet": "We are folloiwng protocol by Hansen et al. (2015) Sci. Pharm. They recommend a Rancilio coffee grinder but these are several hundred dollars. Hoping to use something a little less expensive.", "question_type": "Question", "question_date": "Oct 2017", "views": { "views_count": "97 Views", "views_link": "post/Any_recommendations_on_an_inexpensive_coffee_grinder_to_grind_up_bark_samples_to_measure_pH?_sg=tsmZvLsXrFpn6TG77ljxS8pVJhdOMYVlqqYhQl0BszqPCDW1__lnpczwZl8XJiVROJ8_8G8jaerzpX8" }, "answer": { "answer_count": "2 Answers", "answers_link": "https://www.researchgate.netpost/Any_recommendations_on_an_inexpensive_coffee_grinder_to_grind_up_bark_samples_to_measure_pH?_sg=tsmZvLsXrFpn6TG77ljxS8pVJhdOMYVlqqYhQl0BszqPCDW1__lnpczwZl8XJiVROJ8_8G8jaerzpX8" } }, ... other questions { "title": "Are There Any Ways To Find The Concentration Of A Solution Where Its Chemical Formula And Number Of Moles Are Unknown? ", "link": "https://www.researchgate.netpost/Are_there_any_ways_to_find_the_concentration_of_a_solution_where_its_chemical_formula_and_number_of_moles_are_unknown?_sg=6W-hvIYx-FRel_YiWd62lbksTzeWP7GVkZ3tVO6SgZI7F_czhLz_oFCduq9DVhrhvIUy97168wXrn30", "snippet": "A comprehensive way to find the concentration of random solutions would enhance benefits related with health, industry, technology and commercial aspects. Although beer lambert law is a solution, there are some cases where Epsilon is unknown (Example: A Coca-Cola drink or a cup of coffee). 
In this cases, proper alt", "question_type": "Question", "question_date": "Jan 2022", "views": { "views_count": "742 Views", "views_link": "post/Are_there_any_ways_to_find_the_concentration_of_a_solution_where_its_chemical_formula_and_number_of_moles_are_unknown?_sg=6W-hvIYx-FRel_YiWd62lbksTzeWP7GVkZ3tVO6SgZI7F_czhLz_oFCduq9DVhrhvIUy97168wXrn30" }, "answer": { "answer_count": "4 Answers", "answers_link": "https://www.researchgate.netpost/Are_there_any_ways_to_find_the_concentration_of_a_solution_where_its_chemical_formula_and_number_of_moles_are_unknown?_sg=6W-hvIYx-FRel_YiWd62lbksTzeWP7GVkZ3tVO6SgZI7F_czhLz_oFCduq9DVhrhvIUy97168wXrn30" } } ]