PyQuery 是一个非常强大又灵活的网页解析库
PyQuery 是 Python 仿照 jQuery 的严格实现
语法与 jQuery 几乎完全相同，更多操作可以参考 jQuery 的文档
pip install pyquery
# Example: build a PyQuery document from an HTML string and print the
# document, its Python type, and the result of selecting 'li'.
from pyquery import PyQuery as pq

html = ''' <ul id="container"> <li class="wow fadeIn"> <div class="d-flex latest-small-thumb"> <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden"> <a class="color-white" href="single.html" tabindex="0"> <img src="assets/imgs/news/thumb-11.jpg" alt=""> </a> </div> <div class="post-content media-body align-self-center"> <h5 class="post-title mb-15 text-limit-3-row font-medium"> <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a> </h5> </div> </div> </li> </ul> '''

doc = pq(html)

# Printed in order: the whole document, its type, then the 'li' selection.
for shown in (doc, type(doc), doc('li')):
    print(shown)
# Example: build a PyQuery document from a URL and print its 'head' selection.
# NOTE: this performs a network request when run.
from pyquery import PyQuery as pq

# `encoding` is passed explicitly so the response is decoded as UTF-8.
doc = pq(url="http://www.baidu.com", encoding='utf-8')

# Fix: the original call was missing its closing parenthesis (SyntaxError).
print(doc('head'))
# Example: build a PyQuery document from a local file and print it.
# 'index.html' is a relative path — presumably resolved against the current
# working directory; confirm when running from elsewhere.
from pyquery import PyQuery as pq

doc = pq(filename='index.html')

print(doc)
# Example: query the document with a compound selector
# (an id followed by a class) and print the selection.
from pyquery import PyQuery as pq

html = ''' <ul id="container"> <li class="wow fadeIn"> <div class="d-flex latest-small-thumb"> <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden"> <a class="color-white" href="single.html" tabindex="0"> <img src="assets/imgs/news/thumb-11.jpg" alt=""> </a> </div> <div class="post-content media-body align-self-center"> <h5 class="post-title mb-15 text-limit-3-row font-medium"> <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a> </h5> </div> </div> </li> </ul> '''

doc = pq(html)

# '#container .fadeIn': elements with class "fadeIn" under id="container".
matched = doc('#container .fadeIn')
print(matched)
# Example: select the container first, then call .find('li') on that
# selection; print the type of the result and the result itself.
from pyquery import PyQuery as pq

html = ''' <ul id="container"> <li class="wow fadeIn"> <div class="d-flex latest-small-thumb"> <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden"> <a class="color-white" href="single.html" tabindex="0"> <img src="assets/imgs/news/thumb-11.jpg" alt=""> </a> </div> <div class="post-content media-body align-self-center"> <h5 class="post-title mb-15 text-limit-3-row font-medium"> <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a> </h5> </div> </div> </li> </ul> '''

doc = pq(html)

items = doc('#container')
lis = items.find('li')

for shown in (type(lis), lis):
    print(shown)
# Example: select the .post-thumb div under #container and print the
# result of calling .siblings() on it.
from pyquery import PyQuery as pq

html = ''' <ul id="container"> <li class="wow fadeIn"> <div class="d-flex latest-small-thumb"> <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden"> <a class="color-white" href="single.html" tabindex="0"> <img src="assets/imgs/news/thumb-11.jpg" alt=""> </a> </div> <div class="post-content media-body align-self-center"> <h5 class="post-title mb-15 text-limit-3-row font-medium"> <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a> </h5> </div> </div> </li> </ul> '''

doc = pq(html)

div = doc('#container .post-thumb')

# In this markup the only neighbouring element is the .post-content div,
# so that is presumably what gets printed — verify by running.
neighbours = div.siblings()
print(neighbours)
# Example: read an attribute from a selected element in two ways,
# via the call form attr('href') and the attribute form attr.href.
from pyquery import PyQuery as pq

html = ''' <ul id="container"> <li class="wow fadeIn"> <div class="d-flex latest-small-thumb"> <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden"> <a class="color-white" href="single.html" tabindex="0"> <img src="assets/imgs/news/thumb-11.jpg" alt=""> </a> </div> <div class="post-content media-body align-self-center"> <h5 class="post-title mb-15 text-limit-3-row font-medium"> <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a> </h5> </div> </div> </li> </ul> '''

doc = pq(html)

a = doc('#container .post-content a')
print(a)

# Same attribute, two access styles.
href_via_call = a.attr('href')
print(href_via_call)
href_via_attribute = a.attr.href
print(href_via_attribute)
# Example: select the link inside .post-content and print the value
# returned by its .text() method.
from pyquery import PyQuery as pq

html = ''' <ul id="container"> <li class="wow fadeIn"> <div class="d-flex latest-small-thumb"> <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden"> <a class="color-white" href="single.html" tabindex="0"> <img src="assets/imgs/news/thumb-11.jpg" alt=""> </a> </div> <div class="post-content media-body align-self-center"> <h5 class="post-title mb-15 text-limit-3-row font-medium"> <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a> </h5> </div> </div> </li> </ul> '''

doc = pq(html)

title_link = doc('#container .post-content a')
a = title_link.text()
print(a)
# Example: remove a class from the selected <li> and add it back,
# printing the selection before and after each step.
from pyquery import PyQuery as pq

html = ''' <ul id="container"> <li class="wow fadeIn"> <div class="d-flex latest-small-thumb"> <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden"> <a class="color-white" href="single.html" tabindex="0"> <img src="assets/imgs/news/thumb-11.jpg" alt=""> </a> </div> <div class="post-content media-body align-self-center"> <h5 class="post-title mb-15 text-limit-3-row font-medium"> <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a> </h5> </div> </div> </li> </ul> '''

doc = pq(html)

li = doc('#container li')

# Initial state.
print(li)

# After removing the class.
li.removeClass('fadeIn')
print(li)

# After adding the class back.
li.addClass('fadeIn')
print(li)