最近又到了招聘季,有那么多招聘网站,我们该如何根据关键词找出合适的招聘信息呢?本文根据之前的scrapy爬虫结果,对数据进行过滤,寻找合适的职位信息。
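数据来自Scrapy爬取的职位摘要和职位详情,分别保存在abstract.json和detail.json中。其中,abstract.json的每条记录包含title(职位名称)、link(链接)和date(发布日期)字段,detail.json的每条记录包含title(职位名称)和detail(业务详情)字段。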
要求是根据指定的关键字列表从数据中提取最近一周的职位信息。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName  :wordfilter.py
# @Time      :2022/1/19 17:25
# @Author    :PangXZ
"""Filter scraped job postings by keyword and recency.

Two line-oriented JSON files are read (one JSON object per line, an optional
trailing comma after each object):

* the *detail* file, whose records carry ``title`` and ``detail``;
* the *abstract* file, whose records carry ``title``, ``link`` and ``date``
  (``date`` formatted as ``YYYY-MM-DD``).

A posting is reported when its detail text mentions one of ``KEYWORDS`` and
its abstract entry is dated within the last seven days.
"""
import datetime
import json
import re

# No longer needed by get_right_job; the import is retained so that nothing
# else relying on ``np`` being importable from this module breaks.
import numpy as np  # noqa: F401

KEYWORDS = ['xxxx', 'xx', 'xx', 'xxx', 'xxxx']

# Matches a bracketed tag (e.g. "[tag]") so it can be removed from a title.
_BRACKET_TAG = re.compile(r"\[.*?]")


def _iter_records(document):
    """Yield the decoded JSON object found on each data line of *document*.

    Trailing commas are dropped before decoding. Empty lines and lines that
    hold only ``[`` or ``]`` (array delimiters some exporters put on their
    own line) are skipped instead of crashing ``json.loads``.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a data line is not valid JSON.
    """
    # The context manager guarantees the handle is closed even if decoding
    # a line raises part-way through the file.
    with open(document, 'r', encoding='utf-8') as handle:
        for line in handle:
            payload = line.strip().rstrip(',').rstrip()
            if payload in ('', '[', ']'):
                continue
            yield json.loads(payload)


def load_detail(document, keywords=None):
    """Return the titles of postings whose detail mentions a keyword.

    Args:
        document: path of the detail file.
        keywords: substrings to look for; defaults to the module-level
            ``KEYWORDS`` list.

    Returns:
        A list of unique titles, in order of first appearance, with any
        bracketed tag removed and surrounding whitespace stripped.
    """
    if keywords is None:
        keywords = KEYWORDS
    subject = []
    for record in _iter_records(document):
        title = record['title']
        # ``detail`` may be a string or a list of strings; str() lets a plain
        # substring test cover both shapes.
        detail = str(record['detail'])
        if any(keyword in detail for keyword in keywords):
            subject.append(_BRACKET_TAG.sub("", title).strip())
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(subject))


def load_abstract(document, days=7, today=None):
    """Return the ``[title, link, date]`` entries dated within *days* days.

    Args:
        document: path of the abstract file.
        days: size of the window; an entry is kept when
            ``(today - date).days < days``.
        today: reference date; defaults to ``datetime.date.today()``.

    Returns:
        A list of ``[title, link, date]`` lists in file order, with ``date``
        left as the original string. Records whose ``date`` is null or empty
        are skipped.

    Raises:
        ValueError: if a non-empty date is not of the form ``YYYY-MM-DD``.
    """
    if today is None:
        today = datetime.date.today()
    abstract = []
    for record in _iter_records(document):
        title = record['title']
        link = record['link']
        date = record['date']
        # The check must come before the split: a null date cannot be parsed.
        if not date:
            continue
        year, month, day = map(int, date.split('-'))
        posted = datetime.date(year, month, day)
        if (today - posted).days < days:
            abstract.append([title, link, date])
    return abstract


def get_right_job(position, abstract):
    """Return the abstract entries whose title is one of *position*.

    Args:
        position: iterable of wanted titles (as returned by ``load_detail``).
        abstract: list of ``[title, link, date]`` entries (as returned by
            ``load_abstract``).

    Returns:
        A list of ``[title, link, date]`` lists, one per matching title, in
        the order the titles appear in *abstract*. When a title occurs more
        than once only its first entry is used. Empty inputs give ``[]``.
    """
    wanted = set(position)
    reported = set()
    results = []
    for entry in abstract:
        title = entry[0]
        if title in wanted and title not in reported:
            reported.add(title)
            results.append([title, entry[1], entry[2]])
    return results


def main():
    """Load both files from the working directory and print the matches."""
    positions = load_detail(document='detail.json')
    abstracts = load_abstract(document='abstract.json')
    for row in get_right_job(position=positions, abstract=abstracts):
        print(row)


if __name__ == "__main__":
    main()
执行结果如下: