本文是对本专栏的简单复习。
至此,本专栏已经完结。后续最多只是修补一些内容上的bug了。
写这个专栏的初衷其实是为了自己的复习,如果这些内容对你的学习能起到帮助,那便是我的荣幸。
最后的感悟大概就是:
依然要勤学苦练,最终与实践结合。我们学习代码这个工具就是为了实战使用,而不只是学习函数。
一定要持之以恒地学习,并与实践融合。
本文于2021/12/22首发于CSDN,有不足请指出。
'''1.爬虫'''
# Section 1: web-scraping cheat sheet (requests, BeautifulSoup, lxml XPath,
# regular expressions, Selenium, JSON endpoints), followed by section 2 on
# writing results to files.  The statements are reference snippets; bare
# expressions such as ``tag.attrs`` only show the call to use.
import json
import re
import time

import chardet
import requests
import xlwings as xw
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

# --- Fetch a page with requests ----------------------------------------------
url = ''  # left blank in the notes: fill in the target URL
ua = {"User-Agent": " "}  # left blank in the notes: fill in a User-Agent
rqg = requests.get(url, headers=ua)
# Detect the charset from the raw bytes and decode through ``rqg.text`` so the
# detected encoding is actually applied (instead of a hard-coded UTF-8 decode).
rqg.encoding = chardet.detect(rqg.content)['encoding']
html = rqg.text

# --- BeautifulSoup ------------------------------------------------------------
soup = BeautifulSoup(html, 'lxml')
tag = soup.ul          # first <ul> in the document
tag.attrs              # attributes of that tag
tag.li.get_text()      # text of the first <li> inside it
tag.get('href')        # a single attribute value
soup.find_all('ul')    # every <ul> in the document

# Collect the text and the href of every link inside the first <ul>.
links = tag.find_all('a')
urls = [a.get_text() for a in links]
herfs = [a.get('href') for a in links]
for a in links:
    print(a.get('href'), a.get_text())

# --- lxml / XPath -------------------------------------------------------------
xp = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
xp.xpath('//a')
xp.xpath("body/div/a[starts-with(@id,'co')]")
xp.xpath("//a/text()")
# starts-with() takes two arguments (attribute, prefix); 'co' mirrors the
# prefix used in the query above and should be replaced with the real one.
j = xp.xpath("//p[starts-with(@id,'co')]")
for i in j:
    t = i.xpath('string(.)')  # all text beneath the node, concatenated
    print(t)

# --- Regular expressions ------------------------------------------------------
# No mandatory space after "<title", so a plain <title>...</title> matches too;
# group 1 captures any attributes, group 2 the title text.
title_pattern = r'<title(.*?)>(.*?)</title>'
title_com = re.compile(title_pattern, re.M | re.S)
title_find = title_com.findall(rqg.text)

# --- Selenium: rendered pages -------------------------------------------------
# NOTE(review): passing the driver path positionally is the Selenium 3 form;
# newer Selenium releases expect a Service object - confirm installed version.
div = webdriver.Chrome('./chromedriver')
try:
    div.get(url)
    time.sleep(5)  # give the page time to finish rendering
    html = div.page_source
    element = div.find_element(By.ID, 'pass')  # returns only the first match
    elements = div.find_elements(By.NAME, 'a')  # returns a list
    element2 = div.find_element(By.XPATH, "//p[@id='pass']")
    element3 = div.find_element(By.TAG_NAME, 'div')
finally:
    div.quit()  # always close the browser, even if a lookup fails

# --- JSON endpoint ------------------------------------------------------------
url = ''
ua = {"User-Agent": ' '}
# Alternative: requests.get(url, headers=ua).content.decode('utf-8')
html = requests.get(url, headers=ua).text
data = json.loads(html)
dic = data['data']
for i in dic:
    print(i['picPath'], i['bookName'])

# --- Selenium: list elements on a rendered page --------------------------------
div1 = webdriver.Chrome('./chromedriver')
try:
    div1.get(url)
    time.sleep(5)
    # html = div1.page_source  # rendered HTML, if it is needed
    e1 = div1.find_elements(By.XPATH, "//div[@class='book']")
    for book in e1:
        print(book.text)
finally:
    div1.quit()  # close the browser that was opened for this section

'''2.写入文件'''
# Section 2: writing results to files.
# Explicit encoding so the write does not depend on the platform default.
with open(r'c:\file.txt', 'a+', encoding='utf-8') as f:
    f.write(rqg.text)

wb = xw.Book(r"c:\excel.xlsx")
sht = wb.sheets['Sheet1']
sht.range('A2').value = ['aaa', 'nnn']  # cell A2; the value may also be a DataFrame
'''3.pandas'''
# Section 3: NumPy / pandas / matplotlib cheat sheet.  The statements are
# reference snippets; bare expressions only show the call to use.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.interpolate import lagrange

# --- NumPy: random numbers and reshaping ---------------------------------------
np.random.random((4, 5))               # floats in [0, 1)
np.random.rand(4, 5)                   # uniform distribution over [0, 1)
np.random.randn(4, 5)                  # standard normal; randn(10) gives a 1-D array of ten values
np.random.randint(5, 10, size=[2, 5])  # integers in [5, 10): the upper bound is excluded
arr1 = np.arange(4)
arr1.ravel()    # flatten to 1-D
arr1.flatten()  # flatten to 1-D, returned as a copy
arr2 = np.arange(1)
arr_st = np.concatenate((arr1, arr2), axis=0)  # axis=0 stacks along rows (vertically)
# arr1 is 1-D, so axis 0 is the only axis it can be split on; for a 2-D array
# axis=1 would cut column-wise (axis 0 = rows, axis 1 = columns).
arr_sp = np.split(arr1, 2, axis=0)

# --- pandas: loading and inspecting --------------------------------------------
# header=0 uses the first line of the file as the column names (the default,
# header='infer', behaves the same when ``names`` is not passed).
# header=None is for files without a header line: the columns are numbered and
# an existing header line would be read as a data row.
# 'city' is kept as an ordinary column because the lookups below use it.
df1 = pd.read_csv(r"c:/df.csv", header=0)
df1.values
df1.index
df1.columns
df1.dtypes
df1.size
df1.ndim
df1.shape
df1.describe()
df1.info()
# NOTE(review): mean/var/std need a numeric column, but 'city' is compared
# with strings further down - confirm which column was intended here.
df1['city'].mean()  # also: var, std
df2 = df1.set_index('city')  # used by combine_first below

# --- pandas: selecting ---------------------------------------------------------
df1.loc[:, ['city', 'sex']]
df1[['city', 'sex']]
df1.iloc[:, :2]
df1[:2]
# df1[:, :2] does not work
# df1[0] does not work
df1.loc[df1["city"] == '北京']  # .loc takes a boolean mask, not the frame itself
df1[(df1["city"] == '北京') & (df1["sex"] == 'female')]

# --- pandas: adding / removing / saving ----------------------------------------
data = {"city": 'lanzhou', "sex": "female"}
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True renumbers the rows to avoid index clashes.  The result is
# a new frame; df1 itself is left unchanged.
pd.concat([df1, pd.DataFrame([data])], ignore_index=True)
df1['age'] = [20, 19, 21]
df1.drop([1, 3])  # drop the rows labelled 1 and 3
df1.drop(columns=["age", "city"])  # rows: index=..., or use axis
df1.to_csv(r"C:\i.csv", sep=',')  # to_csv is a DataFrame method

# --- pandas: grouping and combining --------------------------------------------
groupby = df1.groupby('分公司')[['薪水', '小时报酬']].agg(['min', 'max'])
pivot_table = df1.pivot_table(values=['小时报酬', '薪水'], index=['分公司', '部门'])
# join='inner' drops dangling (unmatched) rows, join='outer' keeps them.
concat_join = pd.concat([df1[:2], df1[2:]], axis=1, join='inner')
merge = pd.merge(df1[:2], df1[2:], left_on='学号', right_on="学号")
combine = df1.combine_first(df2)  # merge overlapping data: gaps in df1 are filled from df2

# --- pandas: duplicates and missing values -------------------------------------
df1["姓名"].drop_duplicates()  # remove duplicate values
df1.isnull()  # counterpart: notnull
df1.dropna(axis=0, how='all')  # 'all': drop a row only when every value is missing; 'any': when any is
df1['小时报酬'].fillna(df1['小时报酬'].mean())  # other options: interp1d, make_interp_spline

# Lagrange interpolation.  x / y1 are sample points so the snippet runs;
# substitute the real data.
x = np.array([1, 2, 3, 4, 5])
y1 = x ** 2
l1 = lagrange(x, y1)
l1([6, 7])  # value of the interpolating polynomial at x = 6 and x = 7


def outRange(ser):
    """Return the entries of *ser* outside mean +/- 3 standard deviations.

    Args:
        ser: a pandas Series of numeric values.

    Returns:
        The sub-Series of values below ``mean - 3*std`` or above
        ``mean + 3*std``, keeping their original index labels.
    """
    center = ser.mean()
    spread = 3 * ser.std()
    # Named ``mask`` rather than ``bool`` so the builtin is not shadowed.
    mask = (ser < center - spread) | (ser > center + spread)
    return ser[mask]


outlier = outRange(df1["age"])
pd.get_dummies(df1["name"])  # dummy (one-hot) variables

# --- matplotlib ------------------------------------------------------------------
# Sample data is supplied because bar() requires x and height.
plt.bar(['a', 'b', 'c'], [3, 5, 2])  # also: barh, plot, boxplot, stackplot, hist, pie, scatter, polar, errorbar
plt.figure()
plt.xlabel('x轴标签')
plt.xticks([0, 1, 2], ['a', 'b', 'c'])  # tick positions first, then the tick labels
plt.title('设置标题')
lines = plt.plot([0, 1, 2], [1, 2, 3], [0, 1, 2], [3, 2, 1])  # two sample lines for the legend below
plt.legend()  # picks up labelled artists automatically
plt.legend(lines, ['线条1', "线条2"], loc='best')
plt.grid(True)  # show the grid; positional form avoids the b= / visible= keyword rename
fig, ax = plt.subplots(2, 2, figsize=(10, 5))  # subplots returns (figure, axes)
ax1 = ax[1, 0]
ax1.plot()  # draw on ax1