import json
import os
from py2neo import Graph, Node, Relationship
class GoodsKg:
    """Load scraped goods records from a JSON-lines file into a Neo4j graph.

    Each input line is a JSON object with the keys ``fisrt_class`` (sic),
    ``second_class``, ``third_class`` and ``attrs``.  Categories become
    ``Product`` nodes and brands become ``Brand`` nodes.
    """

    # Number of node names sent to Neo4j per CREATE query.
    BULK_SIZE = 1000

    def __init__(self, host="127.0.0.1", http_port=7474, user="lhy",
                 password="lhy123"):
        """Resolve the data file path and open the Neo4j connection.

        Args:
            host: Address of the server running Neo4j.
            http_port: Port the Neo4j server listens on.
            user: Database user name (``neo4j`` unless it was changed).
            password: Database password.

        The defaults are the values that were hard-coded originally, so
        ``GoodsKg()`` keeps working unchanged.
        """
        cur = os.path.dirname(os.path.abspath(__file__))
        self.data_path = os.path.join(cur, 'data', 'goods_info.json')
        self.g = Graph(
            host=host,
            http_port=http_port,
            user=user,
            password=password)

    def read_data(self):
        """Read the goods file and derive node names and relations.

        Returns:
            A tuple ``(concept_brand, concept_goods, rels_goods, rels_brand)``:
            two sets of node names and two lists of relation strings of the
            form ``start@rel_type@rel_name@end``.
        """
        rels_goods = []
        rels_brand = []
        concept_goods = set()
        concept_brand = set()
        # The context manager closes the file; the explicit encoding keeps the
        # Chinese text independent of the platform default.
        # NOTE(review): assumes the export is UTF-8 - confirm.
        with open(self.data_path, encoding='utf-8') as handle:
            for count, line in enumerate(handle, start=1):
                print(count)
                line = line.strip()
                if not line:
                    # A blank line would make json.loads raise.
                    continue
                data = json.loads(line)
                # 'fisrt_class' is misspelled on purpose: it is the key the
                # stored records use.  Single quotes are stripped from names,
                # as before, so names stay identical to earlier imports.
                first_class = data['fisrt_class'].replace("'", '')
                second_class = data['second_class'].replace("'", '')
                third_class = data['third_class'].replace("'", '')
                attr = data.get('attrs')
                concept_goods.update((first_class, second_class, third_class))
                rels_goods.append('@'.join([second_class, 'is_a', '属于', first_class]))
                rels_goods.append('@'.join([third_class, 'is_a', '属于', second_class]))
                if attr and '品牌' in attr:
                    for brand in attr['品牌'].split(';'):
                        brand = brand.replace("'", '')
                        if not brand:
                            # ''.split(';') yields [''], which would create a
                            # nameless brand.
                            continue
                        concept_brand.add(brand)
                        rels_brand.append('@'.join([brand, 'sales', '销售', third_class]))
        return concept_brand, concept_goods, rels_goods, rels_brand

    def create_graph(self):
        """Build the graph from the data file.

        Only the Brand -> Product edges are created.  The node-creation and
        Product -> Product steps stay disabled, exactly as in the original;
        re-enable them when building a graph from scratch.
        """
        concept_brand, concept_goods, rels_goods, rels_brand = self.read_data()
        # print('creating nodes....')
        # self.create_node('Product', concept_goods)
        # self.create_node('Brand', concept_brand)
        # print('creating edges....')
        # self.create_edges(rels_goods, 'Product', 'Product')
        self.create_edges(rels_brand, 'Brand', 'Product')

    def create_node(self, label, nodes):
        """Create one ``label`` node per name in ``nodes``, in batches.

        Args:
            label: Node label.  Labels cannot be Cypher parameters, so it is
                formatted into the query text and must come from trusted code.
            nodes: Iterable of node names.
        """
        names = list(nodes)
        total = len(names)
        # Names travel as a query parameter, so quotes in scraped names cannot
        # break or alter the statement.
        query = 'UNWIND $names AS name CREATE (:%s {name: name})' % label
        # Slicing by BULK_SIZE also covers the final partial batch, which the
        # earlier counter-based flush condition dropped.
        for start in range(0, total, self.BULK_SIZE):
            batch = names[start:start + self.BULK_SIZE]
            self.g.run(query, {'names': batch})
            print(min(start + self.BULK_SIZE, total), '/', total, 'finished')

    def create_edges(self, rels, start_type, end_type):
        """Create relationships between existing nodes.

        Args:
            rels: Iterable of ``start@rel_type@rel_name@end`` strings;
                duplicates are written once.
            start_type: Label of the start nodes.
            end_type: Label of the end nodes.

        A failing query is printed and skipped so one bad relation does not
        stop the import.
        """
        # Labels and relationship types cannot be parameters; names can.
        template = ('MATCH (m:%s), (n:%s) '
                    'WHERE m.name = $start_name AND n.name = $end_name '
                    'CREATE (m)-[:%s {name: $rel_name}]->(n)')
        for count, rel in enumerate(set(rels), start=1):
            parts = rel.split('@')
            if len(parts) < 4:
                print('skipping malformed relation:', rel)
                continue
            start_name, rel_type, rel_name, end_name = parts[:4]
            query = template % (start_type, end_type, rel_type)
            params = {
                'start_name': start_name,
                'end_name': end_name,
                'rel_name': rel_name,
            }
            try:
                self.g.run(query, params)
            except Exception as e:
                print(e)
            if count % 10 == 0:
                print(count)
if __name__ == '__main__':
    # Build the goods knowledge graph when this file is run as a script.
    handler = GoodsKg()
    handler.create_graph()
#!/usr/bin/env python3
import urllib.request
from urllib.parse import quote_plus
from lxml import etree
import gzip
import chardet
import json
import pymongo
class GoodSchema:
    """Scrape the JD.com category tree and store per-category attributes.

    One document per third-level category is written to the MongoDB
    collection ``goodskg.data``.
    """

    # Seconds to wait for a response; without a timeout a stalled connection
    # blocks the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Open a MongoDB connection with the client's default settings."""
        self.conn = pymongo.MongoClient()

    def _fetch(self, url, headers):
        """Download ``url``; return ``(body_bytes, content_encoding)``.

        A failed request is retried once, matching the single re-request the
        original code performed in its ``except`` branch.
        """
        req = urllib.request.Request(url, headers=headers)
        last_error = None
        for _ in range(2):
            try:
                with urllib.request.urlopen(req, timeout=self.REQUEST_TIMEOUT) as resp:
                    return resp.read(), resp.headers.get('Content-Encoding', '') or ''
            except OSError as err:  # URLError and socket timeouts are OSErrors
                last_error = err
        raise last_error

    @staticmethod
    def _decode(data):
        """Decode bytes with the detected charset, falling back to GBK."""
        encoding = chardet.detect(data).get('encoding')
        if encoding:
            try:
                return data.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                pass
        return data.decode('gbk')

    def get_html(self, url):
        """Fetch a search/list page and return it as text."""
        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
        # Download once and only vary the decoding; the earlier version
        # re-downloaded the page whenever decoding failed.
        data, _ = self._fetch(url, headers)
        return self._decode(data)

    def get_detail_html(self, url):
        """Fetch a detail page with browser-like headers and return it as text."""
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            # Only gzip is advertised: the standard library cannot decode
            # brotli, and the body was always passed to gzip.decompress.
            "accept-encoding": "gzip",
            "accept-language": "en-US,en;q=0.9",
            "cache-control": "max-age=0",
            "referer": "https://www.jd.com/allSort.aspx",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36"
        }
        data, content_encoding = self._fetch(url, headers)
        # Decompress only when the body really is gzip (header or magic bytes);
        # an uncompressed response is used as-is instead of raising.
        if 'gzip' in content_encoding.lower() or data[:2] == b'\x1f\x8b':
            data = gzip.decompress(data)
        return self._decode(data)

    def home_list(self):
        """Walk the category index and store attributes for each third-level category."""
        url = 'https://www.jd.com/allSort.aspx'
        html = self.get_html(url)
        selector = etree.HTML(html)
        divs = selector.xpath('//div[@class= "category-item m"]')
        collection = self.conn['goodskg']['data']
        for indx, div in enumerate(divs):
            first_names = div.xpath('./div[@class="mt"]/h2/span/text()')
            if not first_names:
                continue
            first_name = first_names[0]
            for dl in div.xpath('./div[@class="mc"]/div[@class="items"]/dl'):
                second_names = dl.xpath('./dt/a/text()')
                if not second_names:
                    continue
                second_name = second_names[0]
                # Read name and link from the same <a>, so an anchor without
                # text cannot shift names against links.
                for anchor in dl.xpath('./dd/a'):
                    href = anchor.get('href')
                    third_name = ''.join(anchor.xpath('./text()'))
                    if not href or not third_name:
                        continue
                    # Hrefs are expected to be protocol-relative ('//host/...');
                    # an already absolute URL is left alone.
                    third_url = href if href.startswith('http') else 'https:' + href
                    try:
                        attr_dict = self.parser_goods(third_url)
                        attr_dict.update(self.collect_brands(third_url))
                        data = {
                            # 'fisrt_class' (sic) is the key readers of this
                            # collection look up; do not correct it here.
                            'fisrt_class': first_name,
                            'second_class': second_name,
                            'third_class': third_name,
                            'attrs': attr_dict,
                        }
                        # Collection.insert() was removed in PyMongo 4.
                        collection.insert_one(data)
                        print(indx, len(divs), first_name, second_name, third_name)
                    except Exception as e:
                        # Best effort: report and continue with the next category.
                        print(e)

    @staticmethod
    def _parse_other_exts(html):
        """Extract ``{name: value_name}`` from the page's ``other_exts =[...]`` array."""
        marker = 'other_exts =['
        joined = ''.join(line for line in html.split('\n') if 'other_exts' in line)
        if marker not in joined:
            return {}
        payload = joined.split(marker)[-1].split('];')[0]
        if not payload.strip():
            return {}
        try:
            # Parse the array as a whole; splitting on '},' broke on values
            # that contain that sequence.
            entries = json.loads('[' + payload + ']')
        except ValueError:
            return {}
        attrs = {}
        for entry in entries:
            if isinstance(entry, dict) and 'name' in entry and 'value_name' in entry:
                attrs[entry['name']] = entry['value_name']
        return attrs

    def parser_goods(self, url):
        """Return the attribute dict scraped from a category page."""
        html = self.get_detail_html(url)
        attr_dict = self._parse_other_exts(html)
        selector = etree.HTML(html)
        if selector is None:
            return attr_dict
        for div in selector.xpath('//div[@class="sl-wrap"]'):
            keys = div.xpath('./div[@class="sl-key"]/span/text()')
            if not keys:
                continue
            # NOTE(review): the source as received strips only ':'; the
            # full-width colon is stripped too in case the page uses it.
            attr_name = keys[0].replace(':', '').replace('：', '')
            values = div.xpath('./div[@class="sl-value"]/div/ul/li/a/text()')
            attr_dict[attr_name] = ';'.join(v.replace(' ', '') for v in values)
        return attr_dict

    def collect_brands(self, url):
        """Return ``{'品牌': 'a;b;...'}`` from the brand endpoint, or ``{}``."""
        attr_dict = {}
        brand_url = url + '&sort=sort_rank_asc&trans=1&md=1&my=list_brand'
        html = self.get_html(brand_url)
        if 'html' in html:
            # An HTML page came back instead of the JSON payload.
            return attr_dict
        try:
            data = json.loads(html)
        except ValueError:
            return attr_dict
        brands = data.get('brands') if isinstance(data, dict) else None
        names = [b['name'] for b in brands or [] if isinstance(b, dict) and b.get('name')]
        if names:
            # Set the key only when brands exist, so an empty result does not
            # overwrite a brand value scraped from the page itself.
            attr_dict['品牌'] = ';'.join(names)
        return attr_dict
if __name__ == '__main__':
    # Crawl the category tree when this file is run as a script.
    handler = GoodSchema()
    handler.home_list()