@sherrytp
Created December 6, 2022 03:47
Python (Spyder) scraping code for Spy x Family short reviews on Bilibili.com
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 10 19:36:24 2018
@author: hzp0625
"""
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# import os
# os.chdir('F:\\python_study\\pachong\\工作细胞')


def gethtml(url):
    """Open the page in Chrome and wait for it to load."""
    browser = webdriver.Chrome()
    browser.get(url)
    browser.implicitly_wait(10)
    return browser


def getComment(url):
    """Scrape short reviews (author, comment text, star icons, date,
    like/unlike counts) from a Bilibili bangumi media page."""
    browser = gethtml(url)
    i = 1
    columns = ['id', 'author', 'comment', 'stars1', 'stars2', 'stars3',
               'stars4', 'stars5', 'date', 'like', 'unlike']
    AllArticle = pd.DataFrame(columns=columns)
    print('Connected; starting to scrape')
    while True:
        # Reviews are rendered as <li> items; walk the list one index at a time.
        xpath1 = ('//*[@id="app"]/div[2]/div[2]/div/div[1]/div/div'
                  '/div[4]/div/div/ul/li[{}]'.format(i))
        try:
            target = browser.find_element(By.XPATH, xpath1)
        except NoSuchElementException:
            print('Finished scraping all reviews')
            break
        author = target.find_element(By.XPATH, 'div[1]/div[2]').text
        comment = target.find_element(By.XPATH, 'div[2]/div').text
        # Each star is an <i> icon; its class attribute records whether it is lit.
        stars1 = target.find_element(By.XPATH, 'div[1]/div[3]/span/i[1]').get_attribute('class')
        stars2 = target.find_element(By.XPATH, 'div[1]/div[3]/span/i[2]').get_attribute('class')
        stars3 = target.find_element(By.XPATH, 'div[1]/div[3]/span/i[3]').get_attribute('class')
        stars4 = target.find_element(By.XPATH, 'div[1]/div[3]/span/i[4]').get_attribute('class')
        stars5 = target.find_element(By.XPATH, 'div[1]/div[3]/span/i[5]').get_attribute('class')
        date = target.find_element(By.XPATH, 'div[1]/div[4]').text
        like = target.find_element(By.XPATH, 'div[3]/div[1]').text
        unlike = target.find_element(By.XPATH, 'div[3]/div[2]').text
        comments = pd.DataFrame([[i, author, comment, stars1, stars2, stars3,
                                  stars4, stars5, date, like, unlike]],
                                columns=columns)
        AllArticle = pd.concat([AllArticle, comments], axis=0)
        # Scroll the current item into view so the page lazy-loads the next batch.
        browser.execute_script("arguments[0].scrollIntoView();", target)
        i = i + 1
        if i % 100 == 0:
            print('Scraped {} reviews'.format(i))
    AllArticle = AllArticle.reset_index(drop=True)
    return AllArticle


url = "https://www.bilibili.com/bangumi/media/md28237119/#short"
result = getComment(url)
result.to_csv('spyxfamily.csv', index=False)
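
To run the scraper without a visible browser window, Chrome's headless mode can be enabled through Selenium's Options API. A minimal sketch of a headless variant of gethtml (the '--headless=new' flag assumes a recent Chrome; older versions take plain '--headless'):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def gethtml_headless(url):
    # Same as gethtml above, but runs Chrome without opening a window.
    opts = Options()
    opts.add_argument('--headless=new')  # use '--headless' on older Chrome builds
    browser = webdriver.Chrome(options=opts)
    browser.get(url)
    browser.implicitly_wait(10)
    return browser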
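The stars1..stars5 columns hold the raw class strings of the star icons rather than a numeric rating. A minimal post-processing sketch, assuming a lit star's class contains 'icon-star-light' (that class name is a guess; verify it against the live page markup before relying on it):

import pandas as pd

df = pd.read_csv('spyxfamily.csv')
star_cols = ['stars1', 'stars2', 'stars3', 'stars4', 'stars5']
# Count lit star icons per review to recover a 0-5 rating.
# NOTE: 'icon-star-light' is an assumed class name, not confirmed from the source.
df['rating'] = df[star_cols].apply(
    lambda row: sum('icon-star-light' in str(c) for c in row), axis=1)
print(df[['author', 'rating']].head())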