@meisa233
Last active November 25, 2025 09:45
Crawl the members following a Tieba forum and compare the member lists
def load_usernames(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        # Strip newlines and filter out empty lines
        return set(line.strip() for line in f if line.strip())

def main():
    a_members = load_usernames('a_members.txt')
    b_members = load_usernames('b_members.txt')
    common_members = a_members & b_members  # set intersection
    print(f'Found {len(common_members)} common usernames:')
    for name in sorted(common_members):
        print(name)
    # Also write the common usernames to a file
    with open('common_members.txt', 'w', encoding='utf-8') as f_out:
        for name in sorted(common_members):
            f_out.write(name + '\n')

if __name__ == '__main__':
    main()
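The script above assumes a_members.txt and b_members.txt already exist (one username per line, e.g. produced by the crawler below). As a purely illustrative sketch with made-up names, the same set operations also give you the members unique to each forum:

a_members = {'alice', 'bob', 'carol'}   # placeholder usernames, not real data
b_members = {'bob', 'carol', 'dave'}    # placeholder usernames, not real data
print(sorted(a_members & b_members))    # common to both forums -> ['bob', 'carol']
print(sorted(a_members - b_members))    # only following forum A -> ['alice']
print(sorted(b_members - a_members))    # only following forum B -> ['dave']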
pip install lxml certifi beautifulsoup4 --trusted-host mirrors.tuna.tsinghua.edu.cn -i http://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
import urllib.request as request
import urllib.parse
from bs4 import BeautifulSoup
import re
import time
import logging
import ssl
import certifi

# Logging setup
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
consoleHandler.setFormatter(formatter)
logger.addHandler(consoleHandler)

tieba_name = 'a'
# The member-list endpoint expects the forum name GBK-encoded and percent-escaped
encoded_word = urllib.parse.quote(tieba_name.encode('gbk'))
base_url = f'http://tieba.baidu.com/bawu2/platform/listMemberInfo?word={encoded_word}'

def get_total_pages():
    try:
        # Skip SSL certificate verification (for testing only)
        ssl._create_default_https_context = ssl._create_unverified_context
        req = request.Request(base_url, headers={'User-Agent': 'Mozilla/5.0'})
        html = request.urlopen(req).read().decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        page_span = soup.find('span', class_='tbui_total_page')
        if not page_span:
            logger.error('Total-page element not found; the page structure may have changed')
            return 0
        p = re.compile(r'共(\d+)页')
        match = p.search(page_span.text)
        if match:
            total_pages = int(match.group(1))
            logger.info(f'Total pages: {total_pages}')
            return total_pages
        return 0
    except Exception as e:
        logger.error(f'Failed to get the total page count: {e}')
        return 0

def find_all_users(start_page, end_page):
    user_list = []
    for i in range(start_page, end_page + 1):
        url = f'{base_url}&pn={i}'
        logger.info(f'Crawling page {i}')
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            req = request.Request(url, headers=headers)
            html = request.urlopen(req).read().decode('gbk')
            soup = BeautifulSoup(html, 'lxml')
            outer_div = soup.find('div', class_='forum_info_section member_wrap clearfix bawu-info')
            if not outer_div:
                logger.warning(f'No member list found on page {i}; reached the end or the page structure changed')
                break
            user_spans = outer_div.find_all('span', class_='member')
            for span in user_spans:
                a_tag = span.find('a', class_='user_name')
                if a_tag and a_tag.string:
                    username = a_tag.string.strip()
                    logger.info(f'Found user: {username}')
                    user_list.append(username)
            time.sleep(0.5)
        except Exception as e:
            logger.error(f'Error while crawling page {i}: {e}')
    return user_list

if __name__ == '__main__':
    total_pages = get_total_pages()
    if total_pages > 0:
        # Cap the number of pages crawled to avoid pointless requests; adjust as needed
        total_pages = min(total_pages, 310)
        users = find_all_users(1, total_pages)
        logger.info(f'Collected {len(users)} users in total')
        # Export to a text file
        with open(f'{tieba_name}_members.txt', 'w', encoding='utf-8') as f:
            for user in users:
                f.write(user + '\n')
        logger.info(f'User list saved to {tieba_name}_members.txt')
    else:
        logger.error('Could not determine the total page count; aborting')
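Both crawlers build the request URL the same way: the word query parameter is the forum name encoded as GBK bytes and then percent-escaped (the pages themselves are served as GBK, hence the decode('gbk') calls). A minimal sketch of the difference, using '上海' purely as an example forum name:

import urllib.parse

name = '上海'                                   # example forum name, not from the scripts above
print(urllib.parse.quote(name.encode('gbk')))  # '%C9%CF%BA%A3'  - GBK bytes, what the endpoint expects
print(urllib.parse.quote(name))                # '%E4%B8%8A%E6%B5%B7' - default UTF-8 quoting, wrong here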
import urllib.request as request
import urllib.parse
from bs4 import BeautifulSoup
import re
import time
import logging
import ssl

# Skip SSL certificate verification (for testing only)
ssl._create_default_https_context = ssl._create_unverified_context

# Logging setup
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
consoleHandler.setFormatter(formatter)
logger.addHandler(consoleHandler)

tieba_name = '超绝海南小猪kk'
encoded_word = urllib.parse.quote(tieba_name.encode('gbk'))
base_url = f'http://tieba.baidu.com/bawu2/platform/listMemberInfo?word={encoded_word}'

def get_total_pages():
    headers = {'User-Agent': 'Mozilla/5.0'}
    req = request.Request(base_url, headers=headers)
    try:
        html = request.urlopen(req).read().decode('gbk')
        soup = BeautifulSoup(html, 'html.parser')
        page_span = soup.find('span', class_='tbui_total_page')
        if not page_span:
            logger.error('Total-page element not found; the page structure may have changed')
            return 0
        p = re.compile(r'共(\d+)页')
        match = p.search(page_span.text)
        if match:
            total_pages = int(match.group(1))
            logger.info(f'Total pages: {total_pages}')
            return total_pages
        return 0
    except Exception as e:
        logger.error(f'Failed to get the total page count: {e}')
        return 0

def extract_level(span_tag):
    """Extract the member level from the span tag's CSS classes."""
    if not span_tag:
        return 0
    classes = span_tag.get('class', [])
    for cls in classes:
        if cls.startswith('bawu-info-lv'):
            # Pull out the number after "lv"
            level_match = re.search(r'lv(\d+)', cls)
            if level_match:
                return int(level_match.group(1))
    return 0

def find_all_users(start_page, end_page):
    user_list = []
    for i in range(start_page, end_page + 1):
        url = f'{base_url}&pn={i}'
        logger.info(f'Crawling page {i}')
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            req = request.Request(url, headers=headers)
            html = request.urlopen(req).read().decode('gbk')
            soup = BeautifulSoup(html, 'html.parser')
            # Find every name_wrap div (one per member)
            name_wraps = soup.find_all('div', class_='name_wrap')
            if not name_wraps:
                logger.warning(f'No member list found on page {i}; reached the end or the page structure changed')
                break
            for wrap in name_wraps:
                # Extract the username
                a_tag = wrap.find('a', class_='user_name')
                if a_tag and a_tag.string:
                    username = a_tag.string.strip()
                    # Extract the level
                    level_span = wrap.find('span', class_='forum-level-bawu')
                    level = extract_level(level_span)
                    logger.info(f'Found user: {username}, level: {level}')
                    user_list.append({'username': username, 'level': level})
            time.sleep(0.5)
        except Exception as e:
            logger.error(f'Error while crawling page {i}: {e}')
    return user_list

if __name__ == '__main__':
    total_pages = get_total_pages()
    if total_pages > 0:
        # Cap the number of pages crawled; adjust as needed
        total_pages = min(total_pages, 1000)
        users = find_all_users(1, total_pages)
        logger.info(f'Collected {len(users)} users in total')
        # Export to a text file (CSV format)
        with open(f'{tieba_name}_members_level.txt', 'w', encoding='utf-8') as f:
            f.write('username,level\n')  # header row
            for user in users:
                f.write(f"{user['username']},{user['level']}\n")
        logger.info(f'User list saved to {tieba_name}_members_level.txt')
        # Also log some level statistics
        if users:
            max_level = max(user['level'] for user in users)
            min_level = min(user['level'] for user in users)
            avg_level = sum(user['level'] for user in users) / len(users)
            logger.info(f'Level stats - max: {max_level}, min: {min_level}, mean: {avg_level:.2f}')
    else:
        logger.error('Could not determine the total page count; aborting')
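Beyond the max/min/mean already logged, a small follow-up sketch (assuming the CSV written above, with its 'username,level' header; change the file name if you used a different tieba_name) tallies how many members sit at each level:

import csv
from collections import Counter

# Assumes the file produced by the script above; adjust the name for your forum.
with open('超绝海南小猪kk_members_level.txt', encoding='utf-8') as f:
    reader = csv.DictReader(f)                          # picks up the 'username,level' header
    level_counts = Counter(int(row['level']) for row in reader)

for level in sorted(level_counts):
    print(f'level {level}: {level_counts[level]} members')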