Deploying your own Scrapy spider project on Crawlab

1. The previous post explained how to deploy Crawlab locally.

After deployment, the relevant nodes can be seen on the Crawlab home page.

2. First, create a spider project in PyCharm

2.1 Since the spider here is built with the Scrapy framework, the project is created from the command line:

Create the project: scrapy startproject scrapyTest
List the spiders: scrapy list
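
Scrapy can also generate the spider skeleton for you (run this inside the project directory; the spider name and domain below match the example used later, so adjust them to your own target):

Generate a spider: scrapy genspider SpidersDemo2 society.people.com.cn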

2.2 To verify that the project was created successfully, check whether the project root contains the following files:
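
A project freshly generated by scrapy startproject scrapyTest normally has this layout (the spider file from step 2.3 is added under the spiders/ directory afterwards):

scrapyTest/
    scrapy.cfg            # deployment configuration
    scrapyTest/
        __init__.py
        items.py          # item definitions
        middlewares.py    # downloader / spider middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py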

2.3 Create a .py file and write your own crawling logic in it; here I use People's Daily Online (人民网) as the example:

import scrapy
import re
import json
from scrapy import Item, Field
from crawlab import save_item


# Define the data item
class NewsItem(Item):
    title = Field()
    publish_time = Field()
    content = Field()
    url = Field()
    source_page = Field()


class PeopleSocietySpider(scrapy.Spider):
    name = "SpidersDemo2"
    allowed_domains = ["society.people.com.cn"]
    start_urls = ["http://society.people.com.cn/"]

    def __init__(self, max_news=None, max_pages=None, *args, **kwargs):
        """
        初始化爬虫
        :param max_news: 最大新闻条数限制
        :param max_pages: 最大页面数限制
        """
        super(PeopleSocietySpider, self).__init__(*args, **kwargs)
        self.max_news = int(max_news) if max_news else None
        self.max_pages = int(max_pages) if max_pages else None
        self.news_count = 0  # number of news links found so far
        self.page_count = 0  # number of pages crawled so far
        self.success_count = 0  # number of news articles parsed successfully
        self.failed_count = 0  # number of news articles that failed to parse

    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "DOWNLOAD_DELAY": 1,
        "COOKIES_ENABLED": False,
        "DEFAULT_REQUEST_HEADERS": {
            "A***ept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "A***ept-Language": "zh-***,zh;q=0.8"
        },
        # Add a JSON feed export for debugging
        "FEEDS": {
            'output.json': {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                'indent': 4,
                'overwrite': True
            },
        },
        "CONCURRENT_REQUESTS": 1,
        "LOG_LEVEL": "INFO"
    }

    def parse(self, response):
        """解析人民网社会频道首页"""
        self.logger.info(f"开始处理页面:{response.url}")

        # Count this page toward the crawled-page total
        self.page_count += 1

        # Check the page-count limit
        if self.max_pages and self.page_count > self.max_pages:
            self.logger.info(f"已达到最大页数限制 {self.max_pages},停止爬取")
            return

        # Extract news links from the right-hand column
        right_column_news = response.xpath(
            '//div[@class="fr"]//div[@class="news_box"]//a/@href').getall()

        # Extract news links from the "本网原创" (original content) column
        original_news_links = response.xpath(
            '//h2[@class="qiehuan1 mt15"]/following-sibling::div[@class="headingNews qiehuan1_c"][1]//h5/a/@href').getall()

        # Merge links from both sources
        all_news_links = right_column_news + original_news_links

        # Deduplicate
        unique_links = list(set(all_news_links))

        if not unique_links:
            self.logger.warning("未找到新闻链接")
            return

        # Process the news links
        for link in unique_links:
            # Check the news-count limit
            if self.max_news and self.news_count >= self.max_news:
                self.logger.info(f"已达到最大新闻数限制 {self.max_news},停止爬取")
                return

            # Handle both relative and absolute links
            if link.startswith('http'):
                abs_url = link
            else:
                abs_url = response.urljoin(link)

            self.news_count += 1
            yield scrapy.Request(
                url=abs_url,
                callback=self.parse_news_page,
                meta={'original_url': response.url}  # keep the source page URL for debugging
            )

        # Find the next-page link of the "本网原创" column and follow it
        next_page_link = response.xpath(
            '//div[@class="page_n clearfix"]//a[contains(text(), "下一页")]/@href').get()

        if next_page_link:
            # Make sure the page limit is not exceeded and more news is still needed
            if (not self.max_pages or self.page_count < self.max_pages) and \
                    (not self.max_news or self.news_count < self.max_news):
                next_page_url = response.urljoin(next_page_link)
                self.logger.info(f"正在跟进到下一页: {next_page_url}")
                yield scrapy.Request(
                    url=next_page_url,
                    callback=self.parse
                )
            else:
                reason = ""
                if self.max_pages and self.page_count >= self.max_pages:
                    reason = f"已达到最大页数限制 {self.max_pages}"
                elif self.max_news and self.news_count >= self.max_news:
                    reason = f"已达到最大新闻数限制 {self.max_news}"
                self.logger.info(f"{reason},停止爬取")
        else:
            self.logger.info("没有找到下一页链接,爬取结束")

    def parse_news_page(self, response):
        """解析新闻详情页"""
        self.logger.info(f"解析新闻详情:{response.url}")

        # Extract the news title - try several selectors
        title_selectors = [
            '//h1/text()',
            '//div[@class="title"]/h1/text()',
            '//div[@class="hdNews"]//h5/a/text()'
        ]

        title = ""
        for selector in title_selectors:
            title = response.xpath(selector).get()
            if title:
                title = title.strip()
                break

        # Extract the publish time - try several selectors
        time_selectors = [
            '//div[contains(@class, "col-1-1")]/text()',
            '//div[@class="box01"]/div[@class="fl"]/text()',
            '//em[@class="gray"]/text()'
        ]

        publish_time = ""
        for selector in time_selectors:
            time_text = response.xpath(selector).get()
            if time_text:
                publish_time_match = re.search(
                    r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}|\d{4}年\d{2}月\d{2}日\d{2}:\d{2}|\d{4}\.\d{2}\.\d{2}\s\d{2}:\d{2}',
                    time_text)
                if publish_time_match:
                    publish_time = publish_time_match.group()
                    break

        # Extract the article body - try several selectors
        content_selectors = [
            '//div[@class="rm_txt_con cf"]//p/text()',
            '//div[@class="show_text"]//p/text()',
            '//div[@class="on"]/em//text()',
            '//div[contains(@class, "text")]//p/text()'
        ]

        content = ""
        for selector in content_selectors:
            content_paragraphs = response.xpath(selector).getall()
            if content_paragraphs:
                content = "".join([p.strip() for p in content_paragraphs if p.strip()])
                if content:
                    break

        # Data cleaning
        title = title.strip() if title else "未找到标题"
        content = content.strip() if content else "未找到内容"
        publish_time = publish_time.strip() if publish_time else "未找到发布时间"

        # Build the data dict
        item_data = {
            "title": title,
            "publish_time": publish_time,
            "content": content,
            "url": response.url,
            "source_page": response.meta.get('original_url', 'unknown')
        }

        # Check whether the data is complete
        is_complete = title and title != "未找到标题" and content and content != "未找到内容"

        if is_complete:
            self.success_count += 1
            # Save the data via the Crawlab SDK
            try:
                save_item(item_data)
                self.logger.info(f"通过 Crawlab SDK 保存数据成功: {title[:50]}...")
            except Exception as e:
                self.logger.error(f"通过 Crawlab SDK 保存数据失败: {str(e)}")
        else:
            self.failed_count += 1
            self.logger.warning(
                f"数据不完整 - 标题: {'有' if title and title != '未找到标题' else '无'}, 内容: {'有' if content and content != '未找到内容' else '无'}: {response.url}")

        # Detailed debug logging
        self.logger.info(f"成功生成数据项 - 标题: {title[:50]}...")
        self.logger.info(f"数据详情 - URL: {response.url}, 时间: {publish_time}")

        # Also print to the console so Crawlab can capture it
        print(f"NEWS_ITEM: {json.dumps(item_data, ensure_ascii=False)}")

        # Still yield the item for the JSON feed export
        item = NewsItem()
        for key, value in item_data.items():
            item[key] = value
        yield item

    def closed(self, reason):
        """爬虫关闭时的回调函数"""
        # 从stats中获取实际存储的item数量
        stats = self.crawler.stats
        item_scraped_count = stats.get_value('item_scraped_count', 0)

        self.logger.info(f"爬虫结束运行,原因: {reason}")
        self.logger.info(f"统计信息:")
        self.logger.info(f"- 总共找到新闻链接: {self.news_count} 条")
        self.logger.info(f"- 成功解析新闻: {self.su***ess_count} 条")
        self.logger.info(f"- 解析失败新闻: {self.failed_count} 条")
        self.logger.info(f"- 实际存储item数量: {item_scraped_count} 条")
        self.logger.info(f"- 处理页面数量: {self.page_count} 个")

        print(
            f"CRAWLER_FINISHED: 找到链接 {self.news_count} 条, 成功解析 {self.su***ess_count} 条, 实际存储 {item_scraped_count} 条")

2.4 View your spider
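
Running the list command from the project root should now print the spider name defined in the code above:

List the spiders: scrapy list
Expected output: SpidersDemo2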

2.5 Upload your local spider to Crawlab and run it

2.5.1 Add a new spider in the Spiders module:
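
When adding the spider in Crawlab, the execute command field tells Crawlab what to run inside the uploaded project directory; for this example it would typically be (assuming the default Scrapy project layout):

scrapy crawl SpidersDemo2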

2.5.2 Upload the spider project
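
Note that the Crawlab node needs the project's Python dependencies; a common approach (an assumption about your setup, not something the article specifies) is to ship a requirements.txt with the project, for example:

scrapy
crawlab-sdk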

2.5.3 After the upload completes, click the Run button; once the run succeeds, open the Data tab to see the data you scraped.
