pip install lxml certifi beautifulsoup4 --trusted-host mirrors.tuna.tsinghua.edu.cn -i http://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
Crawl the members following a given Tieba forum and compare two forums' member lists. The first script below compares two saved member lists; the two scripts after it crawl a forum's member list (the second variant also records each member's level).
Compare two member lists (one username per line) and report the names that appear in both:

def load_usernames(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        # Strip newlines and skip blank lines
        return set(line.strip() for line in f if line.strip())


def main():
    a_members = load_usernames('a_members.txt')
    b_members = load_usernames('b_members.txt')

    common_members = a_members & b_members  # set intersection

    print(f'{len(common_members)} usernames appear in both lists:')
    for name in sorted(common_members):
        print(name)

    # Also write the common usernames to a file
    with open('common_members.txt', 'w', encoding='utf-8') as f_out:
        for name in sorted(common_members):
            f_out.write(name + '\n')


if __name__ == '__main__':
    main()
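For a quick check of the comparison logic, here is a minimal sketch that writes two small sample input files in the format load_usernames() expects (the usernames are made up; real files come from the crawler scripts below):

# Minimal sketch: create sample input files for the comparison script above.
sample_a = ['user_one', 'user_two', 'user_three']
sample_b = ['user_two', 'user_three', 'user_four']

with open('a_members.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sample_a) + '\n')
with open('b_members.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sample_b) + '\n')

# Running main() afterwards should report 2 common usernames:
# user_two and user_three.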
Crawl the full member list of one forum and save it, one username per line:

import urllib.request as request
import urllib.parse
from bs4 import BeautifulSoup
import re
import time
import logging
import ssl
import certifi  # unused here: SSL verification is bypassed below

# Logging setup
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
consoleHandler.setFormatter(formatter)
logger.addHandler(consoleHandler)

tieba_name = 'a'
encoded_word = urllib.parse.quote(tieba_name.encode('gbk'))
base_url = f'http://tieba.baidu.com/bawu2/platform/listMemberInfo?word={encoded_word}'


def get_total_pages():
    try:
        # Skip SSL certificate verification (testing only)
        ssl._create_default_https_context = ssl._create_unverified_context
        req = request.Request(base_url, headers={'User-Agent': 'Mozilla/5.0'})
        html = request.urlopen(req).read().decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        page_span = soup.find('span', class_='tbui_total_page')
        if not page_span:
            logger.error('Total-page element not found; the page structure may have changed')
            return 0
        p = re.compile(r'共(\d+)页')
        match = p.search(page_span.text)
        if match:
            total_pages = int(match.group(1))
            logger.info(f'Total pages: {total_pages}')
            return total_pages
    except Exception as e:
        logger.error(f'Failed to get the total page count: {e}')
    return 0


def find_all_users(start_page, end_page):
    user_list = []
    for i in range(start_page, end_page + 1):
        url = f'{base_url}&pn={i}'
        logger.info(f'Fetching page {i}')
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            req = request.Request(url, headers=headers)
            html = request.urlopen(req).read().decode('gbk')
            soup = BeautifulSoup(html, 'lxml')
            outer_div = soup.find('div', class_='forum_info_section member_wrap clearfix bawu-info')
            if not outer_div:
                logger.warning(f'No member list found on page {i}; reached the end or the page structure changed')
                break
            user_spans = outer_div.find_all('span', class_='member')
            for span in user_spans:
                a_tag = span.find('a', class_='user_name')
                if a_tag and a_tag.string:
                    username = a_tag.string.strip()
                    logger.info(f'Found user: {username}')
                    user_list.append(username)
            time.sleep(0.5)
        except Exception as e:
            logger.error(f'Error while fetching page {i}: {e}')
    return user_list


if __name__ == '__main__':
    total_pages = get_total_pages()
    if total_pages > 0:
        # Cap the number of pages to crawl to avoid pointless requests; adjust as needed
        total_pages = min(total_pages, 310)
        users = find_all_users(1, total_pages)
        logger.info(f'Collected {len(users)} users in total')
        # Export to a text file
        with open(f'{tieba_name}_members.txt', 'w', encoding='utf-8') as f:
            for user in users:
                f.write(user + '\n')
        logger.info(f'User list saved to {tieba_name}_members.txt')
    else:
        logger.error('Could not determine the total page count; aborting')
A variant of the crawler that also records each member's level and exports a CSV:

import urllib.request as request
import urllib.parse
from bs4 import BeautifulSoup
import re
import time
import logging
import ssl

# Skip SSL certificate verification (testing only)
ssl._create_default_https_context = ssl._create_unverified_context

# Logging setup
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
consoleHandler.setFormatter(formatter)
logger.addHandler(consoleHandler)

tieba_name = '超绝海南小猪kk'
encoded_word = urllib.parse.quote(tieba_name.encode('gbk'))
base_url = f'http://tieba.baidu.com/bawu2/platform/listMemberInfo?word={encoded_word}'


def get_total_pages():
    headers = {'User-Agent': 'Mozilla/5.0'}
    req = request.Request(base_url, headers=headers)
    try:
        html = request.urlopen(req).read().decode('gbk')
        soup = BeautifulSoup(html, 'html.parser')
        page_span = soup.find('span', class_='tbui_total_page')
        if not page_span:
            logger.error('Total-page element not found; the page structure may have changed')
            return 0
        p = re.compile(r'共(\d+)页')
        match = p.search(page_span.text)
        if match:
            total_pages = int(match.group(1))
            logger.info(f'Total pages: {total_pages}')
            return total_pages
    except Exception as e:
        logger.error(f'Failed to get the total page count: {e}')
    return 0


def extract_level(span_tag):
    """Extract the member level from the span tag's class list."""
    if not span_tag:
        return 0
    classes = span_tag.get('class', [])
    for cls in classes:
        if cls.startswith('bawu-info-lv'):
            # Pull out the number after 'lv'
            level_match = re.search(r'lv(\d+)', cls)
            if level_match:
                return int(level_match.group(1))
    return 0


def find_all_users(start_page, end_page):
    user_list = []
    for i in range(start_page, end_page + 1):
        url = f'{base_url}&pn={i}'
        logger.info(f'Fetching page {i}')
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            req = request.Request(url, headers=headers)
            html = request.urlopen(req).read().decode('gbk')
            soup = BeautifulSoup(html, 'html.parser')
            # Find all name_wrap divs
            name_wraps = soup.find_all('div', class_='name_wrap')
            if not name_wraps:
                logger.warning(f'No member list found on page {i}; reached the end or the page structure changed')
                break
            for wrap in name_wraps:
                # Extract the username
                a_tag = wrap.find('a', class_='user_name')
                if a_tag and a_tag.string:
                    username = a_tag.string.strip()
                    # Extract the level
                    level_span = wrap.find('span', class_='forum-level-bawu')
                    level = extract_level(level_span)
                    logger.info(f'Found user: {username}, level: {level}')
                    user_list.append({'username': username, 'level': level})
            time.sleep(0.5)
        except Exception as e:
            logger.error(f'Error while fetching page {i}: {e}')
    return user_list


if __name__ == '__main__':
    total_pages = get_total_pages()
    if total_pages > 0:
        # Cap the number of pages to crawl; adjust as needed
        total_pages = min(total_pages, 1000)
        users = find_all_users(1, total_pages)
        logger.info(f'Collected {len(users)} users in total')
        # Export to a text file (CSV format)
        with open(f'{tieba_name}_members_level.txt', 'w', encoding='utf-8') as f:
            f.write('用户名,等级\n')  # header row
            for user in users:
                f.write(f"{user['username']},{user['level']}\n")
        logger.info(f'User list saved to {tieba_name}_members_level.txt')
        # Print some summary statistics
        if users:
            max_level = max(user['level'] for user in users)
            min_level = min(user['level'] for user in users)
            avg_level = sum(user['level'] for user in users) / len(users)
            logger.info(f'Level stats - max: {max_level}, min: {min_level}, mean: {avg_level:.2f}')
    else:
        logger.error('Could not determine the total page count; aborting')
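As a follow-up, a minimal sketch of reading the exported CSV back for further analysis, assuming the '用户名,等级' header row and the filename written by the script above:

# Minimal sketch: read the member/level CSV back and count members per level.
import csv
from collections import Counter

with open('超绝海南小猪kk_members_level.txt', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)  # column names come from the '用户名,等级' header
    level_counts = Counter(int(row['等级']) for row in reader)

for level, count in sorted(level_counts.items()):
    print(f'level {level}: {count} member(s)')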