细节问题参考前两章。
在settings.py中设置图片和数据库
# Scrapy settings for the houses project.

BOT_NAME = 'houses'

SPIDER_MODULES = ['houses.spiders']
NEWSPIDER_MODULE = 'houses.spiders'

# Ignore robots.txt for this crawl.
ROBOTSTXT_OBEY = False

# Only show warnings and above in the log output.
LOG_LEVEL = "WARNING"

# Throttle: wait 3 seconds between downloads.
DOWNLOAD_DELAY = 3

# Item pipelines, run in ascending priority order.
ITEM_PIPELINES = {
    'houses.pipelines.MysqlPipeline': 100,       # persist items to MySQL
    'houses.pipelines.HouseImagePipeline': 200,  # download item images
}

# Image pipeline configuration.
IMAGES_STORE = 'images'    # local directory for downloaded images [note the path]
IMAGES_EXPIRES = 90        # skip re-downloading images newer than 90 days
IMAGES_MIN_HEIGHT = 100    # discard images smaller than 100x100
IMAGES_MIN_WIDTH = 100

# MySQL connection settings read by MysqlPipeline via spider.settings.
MYSQL_DB_HOST = "127.0.0.1"
MYSQL_DB_PORT = 3306       # port
MYSQL_DB_NAME = "spier"
MYSQL_DB_USER = "root"
MYSQL_DB_PASSWORD = "123456"
打开cmd,添加表【数据库】
cmd C:\Users\admin>mysql -u root -p mysql> show databases; mysql> use spier; mysql> create table HouseInfo(house varchar(255),address varchar(255),price varchar(255),total varchar(255))ENGINE=InnoDB DEFAULT CHARSET=utf8; -- 建表
pipelines.py
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import pymysql


class MysqlPipeline:
    """Persist scraped house items into the MySQL table ``HouseInfo``."""

    def open_spider(self, spider):
        """Open the database connection when the spider starts.

        Connection parameters come from the MYSQL_DB_* keys in
        settings.py, read through ``spider.settings``.
        """
        host = spider.settings.get("MYSQL_DB_HOST")
        port = spider.settings.get("MYSQL_DB_PORT")
        dbname = spider.settings.get("MYSQL_DB_NAME")
        user = spider.settings.get("MYSQL_DB_USER")
        pwd = spider.settings.get("MYSQL_DB_PASSWORD")
        # charset must match the utf8 table created in MySQL, otherwise
        # Chinese text in house/address is mangled on insert.
        self.db_conn = pymysql.connect(host=host, port=port, db=dbname,
                                       user=user, password=pwd,
                                       charset="utf8")
        self.db_cur = self.db_conn.cursor()  # open a cursor

    def process_item(self, item, spider):
        """Insert one item into HouseInfo and pass it on unchanged.

        Uses %s placeholders so values are parameterized, not
        string-concatenated into the SQL.
        """
        values = (item["house"], item["address"], item["price"], item["total"])
        sql = "insert into HouseInfo(house,address,price,total) values(%s,%s,%s,%s)"
        self.db_cur.execute(sql, values)
        # Commit per item so rows already scraped survive a crash; the
        # original committed only in close_spider, which silently lost
        # every row if the spider terminated abnormally.
        self.db_conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes."""
        self.db_cur.close()   # close cursor
        self.db_conn.close()  # close database connection


class HouseImagePipeline(ImagesPipeline):
    """Download each item's image and record its local storage path."""

    def get_media_requests(self, item, info):
        # item["image_urls"] is a single URL string in this project (the
        # spider stores .get(), not a list), so yield exactly one Request.
        yield Request(item["image_urls"])

    def item_completed(self, results, item, info):
        """Attach the downloaded file path, or drop imageless items.

        ``results`` looks like:
        [(True, {'url': '...', 'path': 'full/<sha1>.jpg',
                 'checksum': '...', 'status': 'downloaded'})]
        """
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem("items contains no images")
        item["image_path"] = image_path[0]
        return item
items.py
import scrapy


class HousesItem(scrapy.Item):
    """Container for one scraped new-home listing."""
    house = scrapy.Field()       # development name
    address = scrapy.Field()     # address
    price = scrapy.Field()       # average price
    total = scrapy.Field()       # total price
    image_urls = scrapy.Field()  # image URL (a single string in this project)
    # BUG FIX: the original wrote `image_path=scrapy.Field` without
    # parentheses, assigning the Field *class* instead of declaring a
    # field, so `item["image_path"] = ...` in HouseImagePipeline raised
    # KeyError. Calling Field() declares the field properly.
    image_path = scrapy.Field()  # local path of the downloaded image
beike.py
import scrapy

from houses.items import HousesItem


class ShellnewsSpider(scrapy.Spider):
    """Crawl Beike (cq.fang.ke.com) Chongqing new-home listing pages."""

    name = 'beike'
    allowed_domains = ['cq.fang.ke.com']
    # Pages 7..7 only (range(7, 8)); widen the range to crawl more pages.
    start_urls = ['http://cq.fang.ke.com/loupan/pg{}'.format(i)
                  for i in range(7, 8)]

    def parse(self, response):
        """Yield one HousesItem per <li> listing on the page."""
        for row in response.xpath("//ul[@class='resblock-list-wrapper']/li"):
            item = HousesItem()  # a fresh item per listing (must be inside the loop)
            # .get() returns None when the xpath matches nothing; guard
            # before .strip() so one malformed listing cannot crash the
            # whole page parse (the original called .strip() unconditionally
            # and raised AttributeError on missing nodes).
            house = row.xpath("div/div[1]/a/text()").get()
            item["house"] = house.strip() if house else ""            # development
            address = row.xpath("div/a[1]/@title").get()
            item["address"] = address.strip() if address else ""      # address
            item["price"] = row.xpath(".//span[@class='number']/text()").get()  # average price
            total = row.xpath(".//div[@class='second']/text()").get()  # total price
            # Strip the leading "总价" (total price) label from the text.
            item["total"] = total.replace("总价", "") if total is not None else ""
            item["image_urls"] = row.xpath("a/img/@data-original").get()  # image URL
            yield item
main.py
# Entry point for launching the spider from an IDE / debugger,
# equivalent to running `scrapy crawl beike` in a terminal.
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "beike"])
运行
1、可以直接运行main.py【main.py 可以用于调试】
2、在终端控制台,输入命令:
scrapy crawl beike -o 【要保存在某文件中】 scrapy crawl beike -o beike.csv
运行结果: