How do I use a SOCKS5 proxy for web crawling in Python?
Below is a complete implementation of a Python crawler that routes its requests through a socks5ip.com.cn SOCKS5 proxy IP.
import requests
from urllib3.exceptions import InsecureRequestWarning
import time
import random
from typing import Dict, List, Optional
import json
import logging

# Suppress SSL certificate warnings (requests below are made with verify=False)
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
class ProxyCrawler:
    def __init__(self, proxy_config: Dict[str, str]):
        """
        Initialize the proxy crawler.
        :param proxy_config: proxy configuration dict with ip, port, username, password
        """
        self.proxy_config = proxy_config
        self.session = requests.Session()
        self.setup_logging()
        self.setup_proxy()

    def setup_logging(self):
        """Configure logging to both a file and the console."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('proxy_crawler.log', encoding='utf-8'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
    def setup_proxy(self):
        """Build the SOCKS5 proxy URL and attach it to the session."""
        proxy_url = (
            f"socks5://{self.proxy_config['username']}:{self.proxy_config['password']}"
            f"@{self.proxy_config['ip']}:{self.proxy_config['port']}"
        )
        self.proxies = {
            'http': proxy_url,
            'https': proxy_url
        }
        self.session.proxies.update(self.proxies)
        self.logger.info(f"Proxy configured: {self.proxy_config['ip']}:{self.proxy_config['port']}")
    def test_proxy(self) -> bool:
        """
        Check whether the proxy is usable.
        :return: True if the proxy works, False otherwise
        """
        try:
            response = self.session.get('https://httpbin.org/ip', timeout=10, verify=False)
            if response.status_code == 200:
                ip_info = response.json()
                self.logger.info(f"Proxy test passed, current IP: {ip_info.get('origin', 'Unknown')}")
                return True
            else:
                self.logger.error(f"Proxy test failed, status code: {response.status_code}")
                return False
        except Exception as e:
            self.logger.error(f"Proxy test raised an exception: {str(e)}")
            return False
    def get_random_headers(self) -> Dict[str, str]:
        """
        Build a request header dict with a randomly chosen User-Agent.
        :return: header dict
        """
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]
        return {
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
    def crawl_with_retry(self, url: str, max_retries: int = 3) -> Optional[requests.Response]:
        """
        Fetch a URL with a retry mechanism.
        :param url: target URL
        :param max_retries: maximum number of attempts
        :return: response object, or None if every attempt failed
        """
        for attempt in range(max_retries):
            try:
                headers = self.get_random_headers()
                response = self.session.get(
                    url,
                    headers=headers,
                    timeout=15,
                    verify=False
                )
                if response.status_code == 200:
                    self.logger.info(f"Fetched successfully: {url}")
                    return response
                else:
                    self.logger.warning(f"Request failed (attempt {attempt+1}/{max_retries}): {url}, status code: {response.status_code}")
            except Exception as e:
                self.logger.warning(f"Request raised an exception (attempt {attempt+1}/{max_retries}): {url}, error: {str(e)}")
            # Wait a random amount of time before retrying
            if attempt < max_retries - 1:
                wait_time = random.uniform(1, 3)
                time.sleep(wait_time)
        self.logger.error(f"Giving up on {url} after {max_retries} failed attempts")
        return None
    def crawl_multiple_urls(self, urls: List[str]) -> Dict[str, Dict]:
        """
        Crawl a list of URLs.
        :param urls: list of target URLs
        :return: dict mapping each URL to its crawl result
        """
        results = {}
        for i, url in enumerate(urls):
            self.logger.info(f"Crawling ({i+1}/{len(urls)}): {url}")
            # Random delay between requests to avoid hitting the target too frequently
            if i > 0:
                delay = random.uniform(0.5, 2)
                time.sleep(delay)
            response = self.crawl_with_retry(url)
            if response:
                results[url] = {
                    'status': 'success',
                    'status_code': response.status_code,
                    'content_length': len(response.content),
                    'title': self.extract_title(response.text)
                }
            else:
                results[url] = {
                    'status': 'failed',
                    'status_code': None,
                    'content_length': 0,
                    'title': None
                }
        return results
    def extract_title(self, html_content: str) -> str:
        """
        Extract the page title from HTML content.
        :param html_content: HTML source
        :return: page title
        """
        try:
            # find() returns -1 when '<title>' is missing, so start == 6 means "not found"
            start = html_content.find('<title>') + 7
            end = html_content.find('</title>')
            if start > 6 and end > start:
                return html_content[start:end].strip()
        except Exception:
            pass
        return "Unknown title"
    def save_results(self, results: Dict, filename: str = 'crawl_results.json'):
        """
        Save crawl results to a JSON file.
        :param results: crawl results
        :param filename: output file name
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            self.logger.info(f"Results saved to: {filename}")
        except Exception as e:
            self.logger.error(f"Failed to save results: {str(e)}")
def main():
    # Proxy configuration obtained from the socks5ip.com.cn dashboard
    proxy_config = {
        'ip': 'your_proxy_ip',        # replace with the actual IP
        'port': 'your_port',          # replace with the actual port
        'username': 'your_username',  # replace with the actual username
        'password': 'your_password'   # replace with the actual password
    }

    # Create the crawler instance
    crawler = ProxyCrawler(proxy_config)

    # Verify the proxy before crawling
    if not crawler.test_proxy():
        print("Proxy test failed, please check the proxy configuration")
        return

    # URLs to crawl
    urls = [
        'https://httpbin.org/ip',
        'https://httpbin.org/user-agent',
        'https://httpbin.org/headers',
        'https://www.example.com',
        'https://httpbin.org/delay/2'  # latency test
    ]

    print("Starting batch crawl...")
    results = crawler.crawl_multiple_urls(urls)

    # Print a summary
    success_count = sum(1 for r in results.values() if r['status'] == 'success')
    print(f"\nCrawl finished! Succeeded: {success_count}/{len(urls)}")

    # Save the results
    crawler.save_results(results)

    # Print per-URL details
    for url, result in results.items():
        print(f"\nURL: {url}")
        print(f"Status: {result['status']}")
        if result['status'] == 'success':
            print(f"Title: {result['title']}")
            print(f"Content size: {result['content_length']} bytes")


if __name__ == "__main__":
    main()
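Note that requests only understands socks5:// proxy URLs when the optional PySocks package is installed, which the requests[socks] extra provides. A minimal requirements.txt sketch for this script, with illustrative version pins that are assumptions rather than exact requirements, might look like:

# requirements.txt -- version pins are illustrative lower bounds
requests[socks]>=2.25.0
PySocks>=1.7.1

Install with pip install -r requirements.txt, or simply pip install "requests[socks]"; the extra pulls in PySocks automatically.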
Code notes:
- The ProxyCrawler class encapsulates the complete proxy crawling workflow: proxy configuration, proxy testing, and request dispatch
- SOCKS5 is supported through the requests library's SOCKS support (PySocks); see the requirements sketch above and the DNS-resolution sketch after these notes
- A retry mechanism re-issues failed requests automatically, with a random delay between attempts
- Random User-Agent selection lowers the risk of being flagged by anti-bot systems
- Batch crawling processes a list of target URLs in a single run
- An integrated logging setup records crawl progress and errors in detail
- Results can be saved as JSON for later analysis
- All third-party dependencies belong in a requirements.txt file, as sketched after the code listing above
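With requests and PySocks, a socks5:// proxy URL (as used in setup_proxy above) resolves hostnames locally before the request is sent through the proxy; switching the scheme to socks5h:// performs DNS resolution on the proxy side instead, which avoids leaking lookups of the target domains. A minimal sketch, independent of the class above and using the same placeholder credentials:

import requests

# socks5h:// = remote DNS resolution through the proxy; socks5:// = local DNS resolution
proxy_url = "socks5h://your_username:your_password@your_proxy_ip:your_port"
proxies = {"http": proxy_url, "https": proxy_url}

# httpbin.org/ip echoes the IP the request arrived from,
# so the response should show the proxy's exit IP
response = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10)
print(response.json())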