1. The previous post explained how to deploy Crawlab locally.
After deployment, the relevant nodes can be seen on the Crawlab home page.
2. First, create a spider project in PyCharm.
2.1 Since I am using the Scrapy framework to build the spider, the project is created from the command line:
Create the project: scrapy startproject scrapyTest
List the spiders: scrapy list
2.2 To verify that the project was created successfully, check that the project root contains the files shown below:
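For reference, a freshly generated Scrapy project typically has roughly the following layout (exact files may differ slightly between Scrapy versions):

scrapyTest/
    scrapy.cfg
    scrapyTest/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py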
2.3 Create a .py file and write your own crawling logic in it; here I use People's Daily Online (人民网) as an example:
import scrapy
import re
import json
from scrapy import Item, Field
from crawlab import save_item

# Define the data item fields
class NewsItem(Item):
    title = Field()
    publish_time = Field()
    content = Field()
    url = Field()
    source_page = Field()

class PeopleSocietySpider(scrapy.Spider):
    name = "SpidersDemo2"
    allowed_domains = ["society.people.com.cn"]
    start_urls = ["http://society.people.com.cn/"]

    def __init__(self, max_news=None, max_pages=None, *args, **kwargs):
        """
        Initialize the spider
        :param max_news: maximum number of news items to crawl
        :param max_pages: maximum number of pages to crawl
        """
        super(PeopleSocietySpider, self).__init__(*args, **kwargs)
        self.max_news = int(max_news) if max_news else None
        self.max_pages = int(max_pages) if max_pages else None
        self.news_count = 0     # number of news links followed
        self.page_count = 0     # number of pages crawled
        self.success_count = 0  # number of news pages parsed successfully
        self.failed_count = 0   # number of news pages that failed to parse

    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "DOWNLOAD_DELAY": 1,
        "COOKIES_ENABLED": False,
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8"
        },
        # JSON feed export, useful for local debugging
        "FEEDS": {
            'output.json': {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                'indent': 4,
                'overwrite': True
            },
        },
        "CONCURRENT_REQUESTS": 1,
        "LOG_LEVEL": "INFO"
    }

    def parse(self, response):
        """Parse the front page of the People's Daily society channel"""
        self.logger.info(f"Processing page: {response.url}")
        # Count pages crawled
        self.page_count += 1
        # Enforce the page limit
        if self.max_pages and self.page_count > self.max_pages:
            self.logger.info(f"Reached the page limit of {self.max_pages}, stopping")
            return
        # News links in the right-hand column
        right_column_news = response.xpath(
            '//div[@class="fr"]//div[@class="news_box"]//a/@href').getall()
        # News links under the "original reporting" section
        original_news_links = response.xpath(
            '//h2[@class="qiehuan1 mt15"]/following-sibling::div[@class="headingNews qiehuan1_c"][1]//h5/a/@href').getall()
        # Merge links from both sources
        all_news_links = right_column_news + original_news_links
        # Deduplicate
        unique_links = list(set(all_news_links))
        if not unique_links:
            self.logger.warning("No news links found")
            return
        # Follow each news link
        for link in unique_links:
            # Enforce the news-count limit
            if self.max_news and self.news_count >= self.max_news:
                self.logger.info(f"Reached the news limit of {self.max_news}, stopping")
                return
            # Handle both relative and absolute links
            if link.startswith('http'):
                abs_url = link
            else:
                abs_url = response.urljoin(link)
            self.news_count += 1
            yield scrapy.Request(
                url=abs_url,
                callback=self.parse_news_page,
                meta={'original_url': response.url}  # keep the listing page URL for debugging
            )
        # Follow the pagination link of the original-reporting section
        # ("下一页" is the Chinese "next page" link text on the site, so it must stay as-is)
        next_page_link = response.xpath(
            '//div[@class="page_n clearfix"]//a[contains(text(), "下一页")]/@href').get()
        if next_page_link:
            # Only follow if we are still within the page and news limits
            if (not self.max_pages or self.page_count < self.max_pages) and \
                    (not self.max_news or self.news_count < self.max_news):
                next_page_url = response.urljoin(next_page_link)
                self.logger.info(f"Following next page: {next_page_url}")
                yield scrapy.Request(
                    url=next_page_url,
                    callback=self.parse
                )
            else:
                reason = ""
                if self.max_pages and self.page_count >= self.max_pages:
                    reason = f"reached the page limit of {self.max_pages}"
                elif self.max_news and self.news_count >= self.max_news:
                    reason = f"reached the news limit of {self.max_news}"
                self.logger.info(f"{reason}, stopping")
        else:
            self.logger.info("No next-page link found, crawl finished")

    def parse_news_page(self, response):
        """Parse a news detail page"""
        self.logger.info(f"Parsing news detail: {response.url}")
        # Extract the title - try several selectors
        title_selectors = [
            '//h1/text()',
            '//div[@class="title"]/h1/text()',
            '//div[@class="hdNews"]//h5/a/text()'
        ]
        title = ""
        for selector in title_selectors:
            title = response.xpath(selector).get()
            if title:
                title = title.strip()
                break
        # Extract the publish time - try several selectors
        time_selectors = [
            '//div[contains(@class, "col-1-1")]/text()',
            '//div[@class="box01"]/div[@class="fl"]/text()',
            '//em[@class="gray"]/text()'
        ]
        publish_time = ""
        for selector in time_selectors:
            time_text = response.xpath(selector).get()
            if time_text:
                # Match "YYYY-MM-DD HH:MM", "YYYY年MM月DD日HH:MM" or "YYYY.MM.DD HH:MM" timestamps
                publish_time_match = re.search(
                    r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}|\d{4}年\d{2}月\d{2}日\d{2}:\d{2}|\d{4}\.\d{2}\.\d{2}\s\d{2}:\d{2}',
                    time_text)
                if publish_time_match:
                    publish_time = publish_time_match.group()
                    break
        # Extract the body text - try several selectors
        content_selectors = [
            '//div[@class="rm_txt_con cf"]//p/text()',
            '//div[@class="show_text"]//p/text()',
            '//div[@class="on"]/em//text()',
            '//div[contains(@class, "text")]//p/text()'
        ]
        content = ""
        for selector in content_selectors:
            content_paragraphs = response.xpath(selector).getall()
            if content_paragraphs:
                content = "".join([p.strip() for p in content_paragraphs if p.strip()])
                if content:
                    break
        # Data cleaning
        title = title.strip() if title else "Title not found"
        content = content.strip() if content else "Content not found"
        publish_time = publish_time.strip() if publish_time else "Publish time not found"
        # Build the data dict
        item_data = {
            "title": title,
            "publish_time": publish_time,
            "content": content,
            "url": response.url,
            "source_page": response.meta.get('original_url', 'unknown')
        }
        # Check whether the record is complete
        is_complete = title and title != "Title not found" and content and content != "Content not found"
        if is_complete:
            self.success_count += 1
            # Save the record through the Crawlab SDK
            try:
                save_item(item_data)
                self.logger.info(f"Saved via Crawlab SDK: {title[:50]}...")
            except Exception as e:
                self.logger.error(f"Failed to save via Crawlab SDK: {str(e)}")
        else:
            self.failed_count += 1
            self.logger.warning(
                f"Incomplete record - title: {'yes' if title and title != 'Title not found' else 'no'}, "
                f"content: {'yes' if content and content != 'Content not found' else 'no'}: {response.url}")
        # Detailed debug logging
        self.logger.info(f"Generated item - title: {title[:50]}...")
        self.logger.info(f"Item details - URL: {response.url}, time: {publish_time}")
        # Also print to stdout so Crawlab can capture it
        print(f"NEWS_ITEM: {json.dumps(item_data, ensure_ascii=False)}")
        # Still yield the item so the JSON feed export works
        item = NewsItem()
        for key, value in item_data.items():
            item[key] = value
        yield item

    def closed(self, reason):
        """Callback invoked when the spider closes"""
        # Read the number of items actually stored from the crawler stats
        stats = self.crawler.stats
        item_scraped_count = stats.get_value('item_scraped_count', 0)
        self.logger.info(f"Spider finished, reason: {reason}")
        self.logger.info("Statistics:")
        self.logger.info(f"- news links found: {self.news_count}")
        self.logger.info(f"- parsed successfully: {self.success_count}")
        self.logger.info(f"- failed to parse: {self.failed_count}")
        self.logger.info(f"- items actually stored: {item_scraped_count}")
        self.logger.info(f"- pages processed: {self.page_count}")
        print(
            f"CRAWLER_FINISHED: links found {self.news_count}, parsed {self.success_count}, stored {item_scraped_count}")
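For a quick local check before hooking the project into Crawlab, the spider can be run from the project root; the max_news / max_pages limits defined in __init__ are passed with Scrapy's -a option (the values below are only an example):

Run the spider: scrapy crawl SpidersDemo2 -a max_news=10 -a max_pages=2

Note that from crawlab import save_item requires the Crawlab Python SDK, which is normally installed with pip install crawlab-sdk. When the spider runs outside a Crawlab task, save_item may not persist anything, in which case the output.json feed configured in custom_settings is the easier way to inspect results locally.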