# Demo script: common BeautifulSoup lookups (attribute access, find/find_all,
# attribute filters and CSS selectors) run against a small sample document.
from bs4 import BeautifulSoup, element

# Sample HTML used for the demo.
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body>
</html>
"""

soup = BeautifulSoup(html, "html.parser")
# print(soup.prettify())

# --- First <p>: three equivalent ways to reach its text --------------------
p1 = soup.find('p')
print(f'1a: {p1.text}')
print(f'1b: {soup.p.text}')
print(f'1c: {soup.p.string}')

# --- Tag-name attribute access returns the first matching tag --------------
print(f'2: {soup.title}')
print(f'3: {soup.head}')
print(f'4: {soup.p}')
print(f'5: {soup.head.name}')
print(f'6: {soup.head.text.strip()}')

# --- Reading and overwriting attributes ------------------------------------
print(f'7: {soup.p.attrs}')
# Overwrites the parsed class list (['title']) with a plain string; this
# mutates the tree, so later searches see class="newClass" on the first <p>.
soup.p['class'] = "newClass"
print(f"8a: {soup.p['class']}")
print(f"8b: {soup.p.get('class')}")
print(f'9a: {soup.a.attrs["href"]}')
print(f'9b: {soup.a["href"]}')

# --- Search by CSS class ---------------------------------------------------
e1 = soup.find(class_='story')
print(f'10a: {e1}')
# Unlike .string, .text joins the text of all descendants of the tag.
print(f'10b: {e1.text}')

print('-' * 60)
# A list of names matches any of the listed tags, in document order.
ap_list = soup.find_all(['a', 'p'])
print(len(ap_list))
for ap in ap_list:
    print(ap)

print('+' * 60)
# `string=` is the current name of the filter formerly spelled `text=`.
# Matching is exact, so the comment node " Elsie " (with spaces) is not hit.
print(soup.find_all(string=["Elsie", "Lacie"]))

print('!' * 60)
li = soup.find_all(id='link1')
print(len(li))
print(li)
print(f'11a: {li[0].text}')
print(f'11b: {li[0].string}')
# The only child of #link1 is an HTML comment; isinstance is the idiomatic
# type check and also accepts subclasses of Comment.
if isinstance(li[0].string, element.Comment):
    print('这是注释')

# --- Arbitrary attribute filter and CSS selectors --------------------------
print(soup.find_all(attrs={"href": "http://example.com/lacie"}))
print(soup.select("head > title"))
print(soup.select('p #link1'))
输出结果
"D:\Program Files\Python\python.exe" C:/Users/issuser/PycharmProjects/pythonProject/10/soup001.py
1a: The Dormouse's story
1b: The Dormouse's story
1c: The Dormouse's story
2: <title>The Dormouse's story</title>
3: <head>
<title>The Dormouse's story</title>
</head>
4: <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
5: head
6: The Dormouse's story
7: {'class': ['title'], 'name': 'dromouse'}
8a: newClass
8b: newClass
9a: http://example.com/elsie
9b: http://example.com/elsie
10a: <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
10b: Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
------------------------------------------------------------
6
<p class="newClass" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
<p class="story">...</p>
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
['Lacie']
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
11a:
11b: Elsie
这是注释
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
[<title>The Dormouse's story</title>]
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]