Java教程

数据采集之贝壳新房【完整代码(数据库+图片)】

本文主要是介绍数据采集之贝壳新房【完整代码(数据库+图片)】,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!

细节问题参考前两章。
在settings.py中设置图片和数据库

# --- Project identity and spider modules ---
BOT_NAME = "houses"
NEWSPIDER_MODULE = "houses.spiders"
SPIDER_MODULES = ["houses.spiders"]

# --- Crawl settings ---
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
LOG_LEVEL = "WARNING"  # only log records at WARNING level or above are shown

# --- Item pipelines ---
ITEM_PIPELINES = {
    "houses.pipelines.MysqlPipeline": 100,
    "houses.pipelines.HouseImagePipeline": 200,  # image download pipeline
}

# --- Image download settings ---
IMAGES_STORE = "images"  # image storage path [take note of this one]
IMAGES_EXPIRES = 90
IMAGES_MIN_WIDTH = 100
IMAGES_MIN_HEIGHT = 100

# --- MySQL connection settings ---
MYSQL_DB_HOST = "127.0.0.1"
MYSQL_DB_PORT = 3306  # port
MYSQL_DB_NAME = "spier"
MYSQL_DB_USER = "root"
MYSQL_DB_PASSWORD = "123456"

打开cmd,添加表【数据库】

cmd
C:\Users\admin>mysql -u root -p
mysql> show databases;
mysql> use spier;
mysql> create table HouseInfo(house varchar(255),address varchar(255),price varchar(255),total varchar(255))ENGINE=InnoDB DEFAULT CHARSET=utf8;  -- 建表

pipelines.py

from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import pymysql
class MysqlPipeline:
    """Item pipeline that inserts each scraped item into the MySQL ``HouseInfo`` table.

    The connection parameters are read from the ``MYSQL_DB_*`` project
    settings. Each item is committed as soon as it has been inserted, so
    rows already written survive a crash or an interrupted crawl.
    """

    # The data is passed separately through the %s placeholders so the
    # driver does the quoting/escaping of the scraped text.
    _INSERT_SQL = "insert into HouseInfo(house,address,price,total) values(%s,%s,%s,%s)"

    def open_spider(self, spider):
        """Open the database connection and a cursor when the spider starts.

        Args:
            spider: the running spider; its ``settings`` provide the
                ``MYSQL_DB_HOST/PORT/NAME/USER/PASSWORD`` values.
        """
        settings = spider.settings
        self.db_conn = pymysql.connect(
            host=settings.get("MYSQL_DB_HOST"),
            # getint() also accepts the port given as a string
            # (e.g. when it is overridden on the command line).
            port=settings.getint("MYSQL_DB_PORT", 3306),
            database=settings.get("MYSQL_DB_NAME"),
            user=settings.get("MYSQL_DB_USER"),
            password=settings.get("MYSQL_DB_PASSWORD"),
            # The scraped text is Chinese; do not rely on the driver's
            # default connection charset.
            charset="utf8mb4",
        )
        self.db_cur = self.db_conn.cursor()

    def process_item(self, item, spider):
        """Insert one item into ``HouseInfo`` and commit it.

        Args:
            item: mapping with the keys ``house``, ``address``, ``price``
                and ``total``.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged item, so later pipelines can keep processing it.

        Raises:
            KeyError: if one of the four keys is missing from ``item``.
            Exception: any database error is re-raised after the pending
                transaction has been rolled back.
        """
        values = (
            item["house"],
            item["address"],
            item["price"],
            item["total"],
        )
        try:
            self.db_cur.execute(self._INSERT_SQL, values)
            self.db_conn.commit()
        except Exception:
            # Leave the connection in a clean state for the next item.
            self.db_conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Commit anything still pending and release the cursor and connection."""
        try:
            self.db_conn.commit()
        finally:
            # Always release the resources, even if the final commit fails.
            try:
                self.db_cur.close()
            finally:
                self.db_conn.close()
class HouseImagePipeline(ImagesPipeline):
    """Image pipeline that downloads an item's image(s) and records the local path."""

    def get_media_requests(self, item, info):
        """Yield one download request per image URL stored on the item.

        ``item["image_urls"]`` may be a single URL string or an iterable of
        URL strings. Missing or empty values yield no request, so the item
        is then dropped by ``item_completed`` instead of crashing here.

        Args:
            item: the scraped item.
            info: media pipeline bookkeeping object (unused).
        """
        image_urls = item.get("image_urls")
        if not image_urls:
            return
        if isinstance(image_urls, str):
            image_urls = [image_urls]
        for image_url in image_urls:
            if image_url:
                yield Request(image_url)

    def item_completed(self, results, item, info):
        """Store the path of the first successfully downloaded image on the item.

        Args:
            results: list of ``(success, data)`` tuples, one per request;
                for a successful download ``data`` is a dict such as
                ``{'url': ..., 'path': 'full/<hash>.jpg', 'checksum': ...,
                'status': 'downloaded'}``.
            item: the scraped item being processed.
            info: media pipeline bookkeeping object (unused).

        Returns:
            The item, with ``image_path`` set.

        Raises:
            DropItem: if no image was downloaded successfully.
        """
        image_path = [data["path"] for ok, data in results if ok]
        if not image_path:
            raise DropItem("items contains no images")
        item["image_path"] = image_path[0]
        return item

items.py

import scrapy
class HousesItem(scrapy.Item):
    """Container for one new-home listing scraped by the spider."""

    house = scrapy.Field()  # name of the housing development
    address = scrapy.Field()  # address
    price = scrapy.Field()  # average price
    total = scrapy.Field()  # total price
    image_urls = scrapy.Field()  # URL of the image
    # Must be a Field *instance*: a bare ``scrapy.Field`` (no parentheses) is
    # not registered as a field, and assigning item["image_path"] then fails.
    image_path = scrapy.Field()  # local path of the downloaded image

beike.py

import scrapy
from houses.items import HousesItem

class ShellnewsSpider(scrapy.Spider):
    """Spider that scrapes the new-home listing pages (``/loupan/``) of cq.fang.ke.com."""

    name = 'beike'
    allowed_domains = ['cq.fang.ke.com']
    # range(7, 8) requests listing page 7 only; widen the range to crawl more pages.
    start_urls = ['http://cq.fang.ke.com/loupan/pg{}'.format(i) for i in range(7, 8)]

    def parse(self, response):
        """Yield one ``HousesItem`` per listing row of the page.

        Text fields whose XPath matches nothing become an empty string
        (``price`` stays ``None``) instead of aborting the whole page.

        Args:
            response: the downloaded listing page.
        """
        for row in response.xpath("//ul[@class='resblock-list-wrapper']/li"):
            # A fresh item must be created inside the loop, one per row.
            item = HousesItem()
            item["house"] = row.xpath("div/div[1]/a/text()").get(default="").strip()  # development name
            item["address"] = row.xpath("div/a[1]/@title").get(default="").strip()  # address
            item["price"] = row.xpath(".//span[@class='number']/text()").get()  # average price
            total = row.xpath(".//div[@class='second']/text()").get(default="")  # total price
            # Simple clean-up: strip the "总价" label from the value.
            item["total"] = total.replace("总价", "")
            image_url = row.xpath("a/img/@data-original").get()  # image address
            # Resolve relative / scheme-less URLs against the page URL so the
            # value can be requested directly; keep a missing value as is.
            item["image_urls"] = response.urljoin(image_url) if image_url else image_url
            yield item

main.py

from scrapy import cmdline
if __name__ == "__main__":
    # Same as typing ``scrapy crawl beike`` in a terminal. The guard keeps the
    # crawl from starting when this module is imported rather than run.
    cmdline.execute("scrapy crawl beike".split())

运行
1、可以直接运行main.py【main.py 可以用于调试】
2、在终端控制台,输入命令:

scrapy crawl beike -o <输出文件名>   【-o 后面写要保存结果的文件,例如下面的 beike.csv】
scrapy crawl beike -o beike.csv

运行结果:
在这里插入图片描述
在这里插入图片描述

这篇关于数据采集之贝壳新房【完整代码(数据库+图片)】的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!