时间:2022-10-26 09:57:05 | 栏目:Python代码 | 点击:次
pip install bs4
pip install lxml
pip install html5lib
# Example: parse an inline HTML fragment and pretty-print the parsed tree.
from bs4 import BeautifulSoup

# The markup is written as adjacent string literals, one element per line;
# they concatenate to a single-line string with single-space separators.
html = (
    ' <div class="modal-dialog">'
    ' <div class="modal-content">'
    ' <div class="modal-header">'
    ' <button type="button" class="close" data-dismiss="modal">×</button>'
    ' <h4 class="modal-title">Modal title</h4>'
    ' </div>'
    ' <div class="modal-body">'
    ' ...'
    ' </div>'
    ' <div class="modal-footer">'
    ' <a href="#" rel="external nofollow" rel="external nofollow"'
    ' class="btn btn-default" data-dismiss="modal">Close</a>'
    ' <a href="#" rel="external nofollow" rel="external nofollow"'
    ' class="btn btn-primary">Save</a>'
    ' </div>'
    ' </div>'
    ' </div>'
    ' '
)

soup = BeautifulSoup(html, 'html.parser')

# prettify() formats the parsed HTML/XML document for output.
print(soup.prettify())
# Example: build a soup from a local HTML file using the lxml parser.
from bs4 import BeautifulSoup

# The file is opened in a `with` block so the handle is closed as soon as
# the document has been parsed, instead of being left open for the rest of
# the program. `soup` stays usable after the block; `fp` is then closed.
with open('html_doc.html', encoding='utf8') as fp:
    soup = BeautifulSoup(fp, 'lxml')
# Example: reach a tag through attribute access and inspect / modify it.
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<p class="name nickname user"><b>i am autofelix</b></p>',
    'html.parser',
)

# The whole <p> element, markup included.
print(soup.p)

# The <b> element nested inside <p>.
print(soup.p.b)

# Text content of <p> (here via .text).
print(soup.p.text)

# Every attribute of <p> and its value, as a dict.
print(soup.p.attrs)

# Type of the object that soup.p evaluates to.
print(type(soup.p))

# Look up one attribute by name; for class the value is a list.
print(soup.p['class'])

# Assign a new list of class names, then print the tag to see the
# updated class attribute in the rendered markup.
soup.p['class'] = ['Web', 'Site']
print(soup.p)
# Example: query a document with CSS selectors through soup.select().
from bs4 import BeautifulSoup

# The markup is written as adjacent string literals, one element per line;
# they concatenate to a single-line string with single-space separators.
html = (
    ' <html>'
    ' <head>'
    ' <title>零基础学编程</title>'
    ' </head>'
    ' <body>'
    ' <p class="intro"><b>i am autofelix</b></p>'
    ' <p class="nickname">飞兔小哥</p>'
    ' <a href="https://autofelix.blog.csdn.net"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="csdn">csdn主页</a>'
    ' <a href="https://xie.infoq.cn/u/autofelix/publish"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="infoq">infoq主页</a>'
    ' <a href="https://blog.51cto.com/autofelix"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="51cto">51cto主页</a>'
    ' <p class="attention">跪求关注 一键三连</p>'
    ' <p class="introduce">'
    ' <a href="https://www.cnblogs.com/autofelix"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="cnblogs">博客园主页</a>'
    ' </p>'
    ' </body>'
    ' </html>'
    ' '
)

soup = BeautifulSoup(html, 'html.parser')

# Select by tag name. A bare word in a selector is a tag name, so it must
# name a real element such as <title>; a class needs a leading dot
# (e.g. '.nickname').
print(soup.select('title'))

# Attribute selector: every <a> that has an href attribute.
print(soup.select('a[href]'))

# Class selector.
print(soup.select('.attention'))

# Descendant selector: <title> somewhere under <head> under <html>.
print(soup.select('html head title'))

# Adjacent-sibling selector: an <a> immediately following a <p>.
print(soup.select('p + a'))

# General-sibling selector combined with an id: the element with
# id="csdn" that follows a <p> sibling.
print(soup.select('p ~ #csdn'))

# nth-of-type(n) matches the n-th sibling of the same element type.
print(soup.select('p ~ a:nth-of-type(1)'))

# Child selectors: direct children only.
print(soup.select('p > a'))
print(soup.select('.introduce > #cnblogs'))
# Example: walk the direct children of the <body> element.
from bs4 import BeautifulSoup

# The markup is written as adjacent string literals, one element per line;
# they concatenate to a single-line string with single-space separators.
html = (
    ' <html>'
    ' <head>'
    ' <title>零基础学编程</title>'
    ' </head>'
    ' <body>'
    ' <p class="intro"><b>i am autofelix</b></p>'
    ' <p class="nickname">飞兔小哥</p>'
    ' <a href="https://autofelix.blog.csdn.net"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="csdn">csdn主页</a>'
    ' <a href="https://xie.infoq.cn/u/autofelix/publish"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="infoq">infoq主页</a>'
    ' <a href="https://blog.51cto.com/autofelix"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="51cto">51cto主页</a>'
    ' <p class="attention">跪求关注 一键三连</p>'
    ' <p class="introduce">'
    ' <a href="https://www.cnblogs.com/autofelix"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="cnblogs">博客园主页</a>'
    ' </p>'
    ' </body>'
    ' </html>'
    ' '
)

soup = BeautifulSoup(html, 'html.parser')

body_tag = soup.body
print(body_tag)

# .contents gives all child nodes as a list.
print(body_tag.contents)

# .children is used to iterate over the child nodes one at a time.
for child in body_tag.children:
    print(child)
# Example: the different ways of filtering with find_all().
import re

from bs4 import BeautifulSoup

# The markup is written as adjacent string literals, one element per line;
# they concatenate to a single-line string with single-space separators.
html = (
    ' <html>'
    ' <head>'
    ' <title>零基础学编程</title>'
    ' </head>'
    ' <body>'
    ' <p class="intro"><b>i am autofelix</b></p>'
    ' <p class="nickname">飞兔小哥</p>'
    ' <a href="https://autofelix.blog.csdn.net"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="csdn">csdn主页</a>'
    ' <a href="https://xie.infoq.cn/u/autofelix/publish"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="infoq">infoq主页</a>'
    ' <a href="https://blog.51cto.com/autofelix"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="51cto">51cto主页</a>'
    ' <p class="attention">跪求关注 一键三连</p>'
    ' <p class="introduce">'
    ' <a href="https://www.cnblogs.com/autofelix"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="cnblogs">博客园主页</a>'
    ' </p>'
    ' </body>'
    ' </html>'
    ' '
)

# Create the soup (parsed document) object.
soup = BeautifulSoup(html, 'html.parser')

# Every <a> tag in the document.
print(soup.find_all("a"))

# Only the first two <a> tags: limit caps the number of results.
print(soup.find_all("a", limit=2))

# Filter by tag name plus attribute value, or by attribute value alone.
print(soup.find_all("p", class_="nickname"))
print(soup.find_all(id="infoq"))

# A list of tag names matches any of the listed tags.
print(soup.find_all(['b', 'a']))

# A compiled regular expression is matched against the id attribute value.
print(soup.find_all('a', id=re.compile(r'.\d')))

# id=True matches tags that have an id attribute, whatever its value.
print(soup.find_all(id=True))

# True as the name filter matches every tag; print each tag's name,
# separated by spaces.
for tag in soup.find_all(True):
    print(tag.name, end=" ")

# Print the name of every tag whose name starts with "b".
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

# Long form of the call (the result is not used here).
soup.find_all("a")

# Shorthand form of the same call (the result is not used here).
soup("a")
# Example: locating a single element with find().
import re

from bs4 import BeautifulSoup

# The markup is written as adjacent string literals, one element per line;
# they concatenate to a single-line string with single-space separators.
html = (
    ' <html>'
    ' <head>'
    ' <title>零基础学编程</title>'
    ' </head>'
    ' <body>'
    ' <p class="intro"><b>i am autofelix</b></p>'
    ' <p class="nickname">飞兔小哥</p>'
    ' <a href="https://autofelix.blog.csdn.net"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="csdn">csdn主页</a>'
    ' <a href="https://xie.infoq.cn/u/autofelix/publish"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="infoq">infoq主页</a>'
    ' <a href="https://blog.51cto.com/autofelix"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="51cto">51cto主页</a>'
    ' <p class="attention">跪求关注 一键三连</p>'
    ' <p class="introduce">'
    ' <a href="https://www.cnblogs.com/autofelix"'
    ' rel="external nofollow" rel="external nofollow"'
    ' rel="external nofollow" rel="external nofollow"'
    ' id="cnblogs">博客园主页</a>'
    ' </p>'
    ' </body>'
    ' </html>'
    ' '
)

# Create the soup (parsed document) object.
soup = BeautifulSoup(html, 'html.parser')

# First <a> tag in the document, returned directly rather than in a list.
print(soup.find('a'))

# The <title> element. The first positional argument of find() is a tag
# name, so it has to be 'title'; 'intro' is a class value, not a tag.
print(soup.find('title'))

# The <a> tag whose href equals the given URL.
print(soup.find('a', href='https://autofelix.blog.csdn.net'))

# First tag whose class attribute matches the regular expression.
print(soup.find(class_=re.compile('tro')))

# The same kind of attribute filter expressed through the attrs dict.
print(soup.find(attrs={'class': 'introduce'}))

# When nothing matches, find() returns None while find_all() returns an
# empty list.
print(soup.find('aa'))
print(soup.find_all('bb'))

# Shorthand: attribute access walks to the first matching descendant.
print(soup.head.title)

# Equivalent to the line above, written with explicit find() calls.
print(soup.find("head").find("title"))