Project Directory

  • run

      scrapy startproject tutorial
    
tutorial/
    scrapy.cfg            # deploy configuration file

    tutorial/             # project's Python module, you'll import your code from here
        __init__.py

        items.py          # project items definition file

        pipelines.py      # project pipelines file

        settings.py       # project settings file

        spiders/          # a directory where you'll later put your spiders
            __init__.py
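
  • The spiders/ directory starts out empty; Scrapy can generate a spider skeleton with the genspider command (the freebuf name and domain below match this write-up's project rather than the tutorial example):

      scrapy genspider freebuf freebuf.com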

Config

# -*- coding: utf-8 -*-

BOT_NAME = 'freebuf'

SPIDER_MODULES = ['freebuf.spiders']
NEWSPIDER_MODULE = 'freebuf.spiders'

# Item Pipelines (lower numbers run first: ImagesPipeline must finish
# downloading before FreebufSavePipeline rewrites the <img> tags)
ITEM_PIPELINES = {
        'scrapy.pipelines.images.ImagesPipeline': 1,
        #'freebuf.pipelines.MongoDBPipeline': 100,
        'freebuf.pipelines.FreebufSavePipeline': 200
}

# Save Setting
LOCAL_STORE = './static/'    # where the rendered HTML pages are written
IMAGES_STORE = LOCAL_STORE   # standard Scrapy setting read by ImagesPipeline
IMAGE_DOWNLOAD_IGNORED = {}  # project-specific setting, not a Scrapy built-in

# Feed Setting
FEED_URI = './export.csv'
FEED_FORMAT = 'csv'

# MongoDB Setting
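# A minimal sketch of what this section might hold (assumption: the
# MONGODB_* names are hypothetical, not Scrapy built-ins, chosen to match
# the MongoDBPipeline commented out in ITEM_PIPELINES above):
#MONGODB_URI = 'mongodb://localhost:27017'
#MONGODB_DB = 'freebuf'
#MONGODB_COLLECTION = 'vuls'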

Source

  • Class Spider: launches the crawl, steers it through the site, and returns the scraped data through Item objects.
import scrapy
from freebuf.items import FreebufItem
from datetime import datetime

class FreebufSpider(scrapy.Spider):
    name = "freebuf"
    allowed_domains = ["freebuf.com"]
    start_urls = [
            "http://www.freebuf.com/vuls",
    ]

    def parse(self, response):
        # Follow every article link on the listing page.
        for url in response.xpath('//div[@class="news-img"]//a/@href').extract():
            yield scrapy.Request(url, callback=self.parse_page)
        # Only the first page schedules the pagination (pages 2-49),
        # so each listing page is requested exactly once.
        if response.url in self.start_urls:
            base_url = "http://www.freebuf.com/vuls/page/"
            for page in range(2, 50):
                yield scrapy.Request(base_url + str(page), callback=self.parse)

    def parse_page(self, response):
        item = FreebufItem()
        item['freebuf_id'] = response.url.split('/')[-1].split('.')[0]
        item['title'] = response.xpath('//title/text()').extract()[0]
        try:
            item['author'] = response.xpath(
                 '//div[@class="title"]//a[@rel="author"]/text()'
                    ).extract()[0]
        except IndexError:
            # Some pages render the author as a plain span instead of a link.
            item['author'] = response.xpath(
                 '//div[@class="title"]//span[@class="name"]/text()'
                    ).extract()[0]
        item['html_title'] = response.xpath(
                '//div[@class="title"]'
                ).extract()[0]
        item['html_body'] = response.xpath(
                '//div[@id="contenttxt"]'
                ).extract()[0]
        # The post date is rendered as "YYYY-MM-DD".
        dt = response.xpath(
                '//div[@class="title"]//span[@class="time"]/text()'
                ).extract()[0].split('-')
        item['datetime'] = datetime(int(dt[0]), int(dt[1]), int(dt[2]))
        item['datetime_update'] = datetime.today()
        # Lazy-loaded images keep the real URL in data-original;
        # ImagesPipeline downloads everything listed in image_urls.
        item['image_urls'] = response.xpath(
                '//div[@id="contenttxt"]//img/@data-original').extract()
        item['images'] = []
        return item
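
  • The spider is started from the project root with (the freebuf argument is FreebufSpider.name):

      scrapy crawl freebuf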
  • Class Item: defines the structure of the scraped data.
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FreebufItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    datetime = scrapy.Field()
    datetime_update = scrapy.Field()
    title = scrapy.Field()
    freebuf_id = scrapy.Field()
    author = scrapy.Field()
    html_title = scrapy.Field()
    html_body = scrapy.Field()

    image_urls = scrapy.Field()
    images = scrapy.Field()
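
  • Note: when ImagesPipeline runs, it fills the images field with one dict per downloaded file; FreebufSavePipeline below relies on the url and path keys (values here are illustrative placeholders):

      [{'url': 'http://example.com/pic.jpg',  # original URL from image_urls
        'path': 'full/<sha1>.jpg',            # relative to IMAGES_STORE
        'checksum': '<md5>'}]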
  • Class Pipeline: processes the Item objects returned by the Spider, formatting and storing the data.
# -*- coding: utf-8 -*-
import os
import re
from scrapy.exporters import CsvItemExporter

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class FreebufSavePipeline(object):
    def __init__(self):
        self.exporter = None
        self.file = None

    def open_spider(self, spider):
        self.file = open('./exporter.csv', 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        store = spider.settings['LOCAL_STORE']
        if not os.path.isdir(store):
            os.makedirs(store)  # ImagesPipeline only creates it on first download
        page_name = store + 'freebuf_' + item['freebuf_id'] + '.html'
        html = item['html_body']
        # Point every <img> tag at its locally downloaded copy;
        # re.escape() keeps URL metacharacters from breaking the pattern.
        for img in item['images']:
            html = re.sub(r'<img [^>]*%s[^>]*>' % re.escape(img['url']),
                    '<img src="%s">' % ('./' + img['path']),
                    html)
        #html = html.replace(
        #        'http://static.3001.net/css/new/bootstrap.min.css',
        #        './css/bootstrap.min.css')
        html = '<!DOCTYPE html><html><head>' + \
                '<meta charset="utf-8"/><title>' + \
                item['title'] + \
                '</title><link rel="stylesheet" type="text/css" href="./css/bootstrap.min.css"/>' + \
                '</head><body><div class="container">' + \
                item['html_title'] + html + '</div></body></html>'
        with open(page_name, 'wb') as f:
            f.write(html.encode('utf-8', 'ignore'))
        # Clear the bulky HTML/image fields so the CSV export stays compact.
        item['html_body'] = None
        item['html_title'] = None
        item['image_urls'] = None
        item['images'] = None
        self.exporter.export_item(item)
        return item
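
  • The MongoDBPipeline commented out in ITEM_PIPELINES is not included in this write-up. A minimal sketch of what it could look like, assuming pymongo is installed and using the hypothetical MONGODB_* settings from the config section (its priority of 100 would make it run before FreebufSavePipeline, while the HTML fields are still populated):

# -*- coding: utf-8 -*-
import pymongo


class MongoDBPipeline(object):
    def open_spider(self, spider):
        # Hypothetical setting names; the defaults are placeholders.
        self.client = pymongo.MongoClient(
                spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017'))
        db = self.client[spider.settings.get('MONGODB_DB', 'freebuf')]
        self.collection = db[spider.settings.get('MONGODB_COLLECTION', 'vuls')]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert on freebuf_id so re-crawling a page does not duplicate it.
        self.collection.update_one(
                {'freebuf_id': item['freebuf_id']},
                {'$set': dict(item)},
                upsert=True)
        return item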