简介
python中cookie用于模拟用户登录。
通过创建 session = requests.session(),使用 session.get(url=url,headers=headers,params=params,cookies=cookies) 发送请求,
达到在会话过程中,请求头,参数,cookie不丢失的目的。注意:直接传给 get() 的 cookies 只对本次请求生效,若要在整个会话中保留,应使用 session.cookies.update(cookies)
爬取如知乎,微博,17k.com等网站时,需要登录才能爬取内容。登录爬取内容最大的问题:在重定向时cookie丢失,使用session(会话)携带cookie时则不会丢失
使用
# Scrape the Zhihu hot list (https://www.zhihu.com/hot) with a logged-in
# session and print the title and excerpt of every entry.
#
# The login cookie is a credential, so it is read from the ZHIHU_COOKIE
# environment variable instead of being hard-coded in the source. Copy the
# value of the browser's "Cookie" request header into that variable.
import os
import re

# Page to scrape.
url = "https://www.zhihu.com/hot"

# Request headers sent with every request of the session.
headers = {
    "user-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
}

# Query-string parameters (none are needed for the hot list).
params = {}

# Name of the environment variable holding the raw "Cookie" header value.
COOKIE_ENV_VAR = "ZHIHU_COOKIE"

# Raw string so that \s and \S reach the regex engine untouched.
pattern = re.compile(
    r'<div class="HotItem-content"><a href="[\s\S]*?" '
    r'title="(?P<news>[\s\S]*?)" target="_blank"[\s\S]*?'
    r'<p class="HotItem-excerpt">(?P<content>[\s\S]*?)</p>'
)


def parse_cookie_header(raw_cookie):
    """Split a raw ``Cookie`` header value into a ``{name: value}`` dict.

    Pairs are separated by ``;``. Only the first ``=`` of each pair separates
    the name from the value, because values may themselves contain ``=``
    (for example base64 padding). Surrounding quotes in a value are kept
    verbatim. Fragments without a name or without ``=`` are ignored.
    """
    cookies = {}
    for fragment in raw_cookie.split(";"):
        name, separator, value = fragment.strip().partition("=")
        if not separator or not name:
            continue
        cookies[name] = value
    return cookies


def extract_hot_items(page_content):
    """Return a list of ``(news, content)`` tuples found in the page HTML."""
    return [
        (match.group("news"), match.group("content"))
        for match in pattern.finditer(page_content)
    ]


def fetch_hot_page(cookies):
    """Download the hot-list page with *cookies* stored on the session.

    The cookies are written into the session's cookie jar rather than passed
    to ``get()``, because per-request arguments are not persisted by the
    session. Raises ``requests.HTTPError`` on a non-2xx response.
    """
    # Imported here so the pure helpers above stay usable without requests.
    import requests

    with requests.Session() as session:
        session.headers.update(headers)
        session.cookies.update(cookies)
        response = session.get(url, params=params, timeout=10)
        response.raise_for_status()
        return response.text


def main():
    """Read the cookie from the environment, fetch the page, print the items."""
    raw_cookie = os.environ.get(COOKIE_ENV_VAR, "")
    if not raw_cookie:
        raise SystemExit(
            f"Set {COOKIE_ENV_VAR} to the Cookie header of a logged-in browser session."
        )

    page_content = fetch_hot_page(parse_cookie_header(raw_cookie))
    for news, content in extract_hot_items(page_content):
        print(news)
        print(content)
        print()


if __name__ == "__main__":
    main()