Project Directory
Run
scrapy startproject tutorial
which creates the following layout:
tutorial/
    scrapy.cfg            # deploy configuration file
    tutorial/             # the project's Python module; you'll import your code from here
        __init__.py
        items.py          # project items definition file
        pipelines.py      # project pipelines file
        settings.py       # project settings file
        spiders/          # a directory where you'll later put your spiders
            __init__.py
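Once the spider is written (the project in the rest of this post is named freebuf, not tutorial), the crawl is started from the project root with:
scrapy crawl freebuf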
Config
# -*- coding: utf-8 -*-
BOT_NAME = 'freebuf'
SPIDER_MODULES = ['freebuf.spiders']
NEWSPIDER_MODULE = 'freebuf.spiders'
# Item Pipelines
ITEM_PIPELINES = {
'scrapy.pipelines.images.ImagesPipeline': 1,
#'freebuf.pipelines.MongoDBPipeline': 100,
'freebuf.pipelines.FreebufSavePipeline': 200
}
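# lower numbers run first (valid range 0-1000): images must be downloaded
# by ImagesPipeline before FreebufSavePipeline rewrites the <img> tags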
# Save Setting
LOCAL_STORE = './static/'
IMAGES_STORE = LOCAL_STORE
IMAGE_DOWNLOAD_IGNORED = {}
# Feed Setting
FEED_URI = './export.csv'
FEED_FORMAT = 'csv'
# MongoDB Setting
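Two things worth noting about this config: ImagesPipeline needs Pillow installed and writes downloads under IMAGES_STORE in a full/ subdirectory, which is where the img['path'] values used by FreebufSavePipeline come from; and since FEED_URI/FEED_FORMAT already produce ./export.csv through Scrapy's feed exports, the CsvItemExporter in the pipeline below writes a second CSV at ./exporter.csv.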
Source
- Class Spider: starts the crawl, drives the spider from page to page, and returns the required data via Item objects.
import scrapy
from freebuf.items import FreebufItem
from datetime import datetime
class FreebufSpider(scrapy.Spider):
name = "freebuf"
allowed_domains = ["freebuf.com"]
start_urls = [
"http://www.freebuf.com/vuls",
]
    def parse(self, response):
        # every article thumbnail on the listing page links to a full post
        for url in response.xpath('//div[@class="news-img"]//a/@href').extract():
            yield scrapy.Request(url, callback=self.parse_page)
        # only the start page fans out to listing pages 2..49
        if response.url in self.start_urls:
            base_url = "http://www.freebuf.com/vuls/page/"
            for page in range(2, 50):
                url = base_url + str(page)
                yield scrapy.Request(url, callback=self.parse)
    def parse_page(self, response):
        item = FreebufItem()
        # e.g. .../vuls/12345.html -> 12345
        item['freebuf_id'] = response.url.split('/')[-1].split('.')[0]
        item['title'] = response.xpath('//title/text()').extract()[0]
        try:
            item['author'] = response.xpath(
                '//div[@class="title"]//a[@rel="author"]/text()'
            ).extract()[0]
        except IndexError:
            # some posts render the author as a plain span instead of a link
            item['author'] = response.xpath(
                '//div[@class="title"]//span[@class="name"]/text()'
            ).extract()[0]
item['html_title'] = response.xpath(
'//div[@class="title"]'
).extract()[0]
item['html_body'] = response.xpath(
'//div[@id="contenttxt"]'
).extract()[0]
        # the post time is rendered as YYYY-MM-DD
        dt = response.xpath(
            '//div[@class="title"]//span[@class="time"]/text()'
        ).extract()[0].split('-')
        item['datetime'] = datetime(int(dt[0]), int(dt[1]), int(dt[2]))
        item['datetime_update'] = datetime.today()
        # article images are lazy-loaded, so the real URL sits in data-original
        item['image_urls'] = []
        item['images'] = []
        for url in response.xpath(
                '//div[@id="contenttxt"]//img/@data-original').extract():
            item['image_urls'].append(url)
return item
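The XPath expressions above are easiest to verify interactively with scrapy shell before committing them to the spider, e.g.:
scrapy shell "http://www.freebuf.com/vuls"
>>> response.xpath('//div[@class="news-img"]//a/@href').extract()[:3]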
- Class Item: defines the structure of the scraped data.
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class FreebufItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
datetime = scrapy.Field()
datetime_update = scrapy.Field()
title = scrapy.Field()
freebuf_id = scrapy.Field()
author = scrapy.Field()
html_title = scrapy.Field()
html_body = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
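image_urls and images follow the contract of Scrapy's ImagesPipeline: the pipeline reads the URLs from image_urls and, once the downloads finish, fills images with one dict per file, roughly of the form (URL and checksum here are made up):
{'url': 'http://image.3001.net/images/example.jpg',
 'path': 'full/0a79c461a4062ac383dc4fade7bc09f14d3438aa.jpg',
 'checksum': '5adcea82...'}
FreebufSavePipeline below relies on the path key to rewrite image links.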
- Class Pipeline: processes the Item objects returned by the Spider, formatting and persisting the data.
# -*- coding: utf-8 -*-
import re
from scrapy.exporters import CsvItemExporter
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class FreebufSavePipeline(object):
    def __init__(self):
        self.exporter = None
        self.file = None

    def open_spider(self, spider):
        self.file = open('./exporter.csv', 'wb')
        self.exporter = CsvItemExporter(self.file)
self.exporter.start_exporting()
def close_spider(self, spider):
self.exporter.finish_exporting()
        self.file.close()
    def process_item(self, item, spider):
        # settings are reached through the running spider instead of the
        # removed scrapy.conf module
        page_name = spider.settings['LOCAL_STORE'] + 'freebuf_' + \
            item['freebuf_id'] + '.html'
html = item['html_body']
        # swap each remote <img> tag for one pointing at the local copy;
        # the URL is escaped because it is spliced into a regex
        for img in item['images']:
            html = re.sub(r'<img [^>]*%s[^>]*>' % re.escape(img['url']),
                          '<img src="%s">' % ('./' + img['path']),
                          html)
#html = html.replace(
# 'http://static.3001.net/css/new/bootstrap.min.css',
# './css/bootstrap.min.css')
html = '<!DOCTYPE html><html><head>' + \
'<meta charset="utf-8"/><title>' + \
item['title'] + \
'</title><link rel="stylesheet" type="text/css" href="./css/bootstrap.min.css"/>' + \
'</head><body><div class="container">' + \
item['html_title'] + html + '</div></body></html>'
with open(page_name, 'wb') as f:
f.write(html.encode('utf-8','ignore'))
        # drop the bulky HTML fields so only the metadata lands in the CSV
        item['html_body'] = None
        item['html_title'] = None
        item['image_urls'] = None
        item['images'] = None
self.exporter.export_item(item)
return item
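ITEM_PIPELINES also references a commented-out freebuf.pipelines.MongoDBPipeline whose source is not shown. A minimal sketch, assuming pymongo and two hypothetical settings MONGODB_URI and MONGODB_DB that the original config elides:
import pymongo

class MongoDBPipeline(object):
    def open_spider(self, spider):
        # MONGODB_URI / MONGODB_DB are assumed names, not part of the original settings
        uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
        self.client = pymongo.MongoClient(uri)
        self.db = self.client[spider.settings.get('MONGODB_DB', 'freebuf')]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # upsert on freebuf_id so re-crawling a post stays idempotent
        self.db['posts'].update_one(
            {'freebuf_id': item['freebuf_id']},
            {'$set': dict(item)},
            upsert=True)
        return item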