BeautifulSoup parses an HTML document into a tree of interconnected nodes, and knowing how to navigate that tree is the key to extracting data. This chapter walks through the methods for traversing the document tree.
To ground the navigation methods, we first define a sample HTML document:
from bs4 import BeautifulSoup

html_doc = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div id="container" class="main-content">
        <h1>Main Title</h1>
        <p class="intro">This is an intro paragraph</p>
        <div class="articles">
            <article class="post">
                <h2>Article Title 1</h2>
                <p>Article content 1</p>
            </article>
            <article class="post">
                <h2>Article Title 2</h2>
                <p>Article content 2</p>
            </article>
        </div>
        <ul class="nav">
            <li>Home</li>
            <li>About Us</li>
            <li>Contact Us</li>
        </ul>
    </div>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
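Before navigating, it can help to see the tree the parser actually built: lxml normalizes the markup, and the whitespace between tags becomes text nodes in the tree. A quick, optional check using prettify():

# Print an indented rendering of the parsed tree, one node per line
print(soup.prettify())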
A child node is a node nested directly inside the current node.

.contents: get the list of direct children

# Get all direct children of the body tag
body = soup.body
print("Number of direct children of body:", len(body.contents))
print("Direct children of body:")
for i, child in enumerate(body.contents, 1):
    print(f"{i}: {repr(child)}")

# Get the direct children of div#container
container = soup.find('div', id='container')
print("\nDirect children of container:")
for child in container.contents:
    if not (isinstance(child, str) and not child.strip()):  # skip whitespace-only text nodes
        print(child.name if getattr(child, 'name', None) else repr(child))
.children: get a generator of direct children

# .children returns a generator, which is more memory-efficient than .contents
container = soup.find('div', id='container')
print("Children of container:")
for child in container.children:
    if not (isinstance(child, str) and not child.strip()):  # skip whitespace-only text nodes
        print(f"Node type: {type(child).__name__}, content: {repr(child)[:50]}...")
.descendants: get all descendant nodes

# .descendants recursively yields every descendant (tags and text nodes)
container = soup.find('div', id='container')
print("Number of descendants of container (including text nodes):", sum(1 for _ in container.descendants))
print("\nFirst 5 descendants of container:")
count = 0
for descendant in container.descendants:
    if count >= 5:
        break
    print(f"Type: {type(descendant).__name__}, content: {repr(descendant)[:40]}")
    count += 1
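Because .descendants interleaves whitespace text nodes with tags, it is often useful to keep only the tag nodes. A small sketch using bs4's Tag class:

from bs4 import Tag

# Keep only tag descendants, dropping text nodes
tag_descendants = [d for d in container.descendants if isinstance(d, Tag)]
print("Tag descendants of container:", [t.name for t in tag_descendants])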
The parent node is the node that directly contains the current node.

.parent: get the direct parent

# Get the parent of the h1 tag
h1 = soup.h1
parent = h1.parent
print(f"Parent of h1: {parent.name}")
print(f"Parent's attributes: {parent.attrs}")

# Text nodes have parents too
intro_p = soup.find('p', class_='intro')
text_node = intro_p.string
print(f"\nParent of the text node: {text_node.parent.name}")
print(f"Parent's class: {text_node.parent.get('class')}")
.parents: get all ancestor nodes

# Get all ancestors of the h1 tag
h1 = soup.h1
print("All ancestors of h1:")
for i, parent in enumerate(h1.parents, 1):
    print(f"{i}. {parent.name}")
# The iteration ends at the BeautifulSoup object itself, whose name is '[document]'

# List all ancestors of a specific article
article = soup.find('article')
print("\nAll ancestors of article:")
for parent in article.parents:
    print(f"- {parent.name}")
Sibling nodes are nodes that share the same parent.

.next_sibling and .previous_sibling

# Get the next sibling
intro_p = soup.find('p', class_='intro')
next_sibling = intro_p.next_sibling
print(f"Next sibling of intro_p: {repr(next_sibling)}")

# Step through the siblings one at a time
print("\nAll siblings after intro_p:")
current = intro_p
while current.next_sibling is not None:
    current = current.next_sibling
    print(f"Type: {type(current).__name__}, content: {repr(current)[:40]}")

# Get the previous sibling
articles_div = soup.find('div', class_='articles')
prev_sibling = articles_div.previous_sibling
print(f"\nPrevious sibling of articles_div: {repr(prev_sibling)}")
.next_siblings and .previous_siblings

# Get all following siblings
intro_p = soup.find('p', class_='intro')
print("All siblings after intro_p:")
for sibling in intro_p.next_siblings:
    if not (isinstance(sibling, str) and not sibling.strip()):  # skip whitespace-only text nodes
        print(f"- {sibling.name if getattr(sibling, 'name', None) else repr(sibling)[:30]}")

# Get all preceding siblings
ul = soup.find('ul', class_='nav')
print("\nAll siblings before ul:")
for sibling in ul.previous_siblings:
    if not (isinstance(sibling, str) and not sibling.strip()):
        print(f"- {sibling.name if getattr(sibling, 'name', None) else repr(sibling)[:30]}")
You can also navigate nodes in document parse order.

.next_element and .previous_element

# .next_element returns the next node in parse order (not necessarily a sibling)
intro_p = soup.find('p', class_='intro')
next_element = intro_p.next_element
print(f"Next element after intro_p: {repr(next_element)}")

# Step through several elements in a row
print("\nThe next 3 elements after intro_p:")
current = intro_p
for i in range(3):
    current = current.next_element
    print(f"{i+1}. {repr(current)}")

# .previous_element returns the node parsed immediately before
articles_div = soup.find('div', class_='articles')
prev_element = articles_div.previous_element
print(f"\nElement before articles_div: {repr(prev_element)}")
.next_elements and .previous_elements

# Get all following elements
h1 = soup.h1
print("First 5 elements after h1:")
count = 0
for element in h1.next_elements:
    if count >= 5:
        break
    if not (isinstance(element, str) and not element.strip()):  # skip whitespace-only text nodes
        print(f"- {repr(element)[:40]}")
        count += 1

# Get all preceding elements
first_article = soup.find('article')
print(f"\nFirst 5 elements before {first_article.name}:")
count = 0
for element in first_article.previous_elements:
    if count >= 5:
        break
    if not (isinstance(element, str) and not element.strip()):
        print(f"- {repr(element)[:40]}")
        count += 1
Below is a complete example that combines the navigation methods above to extract specific pieces of data.
from bs4 import BeautifulSoup

# A more complex HTML document
complex_html = """
<div id="content">
    <header>
        <h1>Blog Title</h1>
        <nav>
            <ul>
                <li><a href="#home">Home</a></li>
                <li><a href="#about">About</a></li>
                <li><a href="#contact">Contact</a></li>
            </ul>
        </nav>
    </header>
    <main>
        <article id="post-1" class="post">
            <h2>Python Programming Tips</h2>
            <p class="meta">Author: Zhang San | Published: 2024-01-15</p>
            <div class="content">
                <p>Python is a powerful programming language...</p>
                <ul>
                    <li>List item 1</li>
                    <li>List item 2</li>
                    <li>List item 3</li>
                </ul>
            </div>
        </article>
        <article id="post-2" class="post">
            <h2>BeautifulSoup Tutorial</h2>
            <p class="meta">Author: Li Si | Published: 2024-01-16</p>
            <div class="content">
                <p>BeautifulSoup is an excellent HTML parsing library...</p>
            </div>
        </article>
    </main>
    <aside>
        <div class="sidebar">
            <h3>Popular Posts</h3>
            <ul>
                <li><a href="#">Python Basics</a></li>
                <li><a href="#">Data Science</a></li>
                <li><a href="#">Machine Learning</a></li>
            </ul>
        </div>
    </aside>
    <footer>
        <p>Copyright © 2024</p>
    </footer>
</div>
"""
soup = BeautifulSoup(complex_html, 'lxml')
def extract_navigation_links():
    """Extract all links from the navigation menu."""
    nav = soup.find('nav')
    links = []
    if nav:
        # Find every a tag under nav
        for a in nav.find_all('a'):
            link_info = {
                'text': a.text,
                'href': a.get('href', ''),
                'parent': a.parent.name if a.parent else None,
                'grandparent': a.parent.parent.name if a.parent and a.parent.parent else None
            }
            links.append(link_info)
    return links
def extract_article_hierarchy():
    """Extract the structural hierarchy of each article."""
    articles = []
    for article in soup.find_all('article', class_='post'):
        article_info = {
            'id': article.get('id', ''),
            'title': article.h2.text if article.h2 else '',
            'author': '',
            'date': '',
            'paragraphs': [],
            'list_items': []
        }
        # Pull the author and date out of the meta paragraph
        meta_p = article.find('p', class_='meta')
        if meta_p:
            text = meta_p.text
            if 'Author:' in text and 'Published:' in text:
                article_info['author'] = text.split('Author:')[1].split('|')[0].strip()
                article_info['date'] = text.split('Published:')[1].strip()
        # Collect the content paragraphs
        content_div = article.find('div', class_='content')
        if content_div:
            # All paragraphs
            for p in content_div.find_all('p'):
                article_info['paragraphs'].append(p.text.strip())
            # All list items
            for li in content_div.find_all('li'):
                article_info['list_items'].append(li.text.strip())
        articles.append(article_info)
    return articles
def find_sibling_articles():
    """Find the sibling articles of the first article."""
    first_article = soup.find('article', id='post-1')
    sibling_info = {
        'current': first_article.get('id', '') if first_article else '',
        'next_sibling': '',
        'previous_sibling': ''
    }
    if first_article:
        # Find the next article sibling
        next_sib = first_article.find_next_sibling('article')
        if next_sib:
            sibling_info['next_sibling'] = next_sib.get('id', '')
        # Find the previous article sibling
        prev_sib = first_article.find_previous_sibling('article')
        if prev_sib:
            sibling_info['previous_sibling'] = prev_sib.get('id', '')
    return sibling_info
# Run the extraction
print("=== Navigation links ===")
nav_links = extract_navigation_links()
for link in nav_links:
    print(f"Link: {link['text']}, URL: {link['href']}, parent: {link['parent']}")

print("\n=== Article hierarchy ===")
articles = extract_article_hierarchy()
for i, article in enumerate(articles, 1):
    print(f"\nArticle {i} - {article['title']}")
    print(f"Author: {article['author']}, date: {article['date']}")
    print(f"Paragraph count: {len(article['paragraphs'])}")
    print(f"List items: {article['list_items'][:2]}...")  # show only the first two

print("\n=== Sibling article relationships ===")
siblings = find_sibling_articles()
print(f"Current article: {siblings['current']}")
print(f"Next article: {siblings['next_sibling']}")
print(f"Previous article: {siblings['previous_sibling']}")
find_parent() and find_parents()

# Find a specific ancestor
first_li = soup.find('li')
# Find the nearest div ancestor
div_parent = first_li.find_parent('div')
print(f"div ancestor of the first li: {div_parent.get('id', '') if div_parent else 'none'}")

# Find all article ancestors
meta_p = soup.find('p', class_='meta')
article_parents = list(meta_p.find_parents('article'))
print(f"Number of article ancestors of meta_p: {len(article_parents)}")
find_next_sibling() and find_previous_sibling()

# Find siblings of a specific type
first_article = soup.find('article')
# Find the next article sibling
next_article = first_article.find_next_sibling('article')
print(f"Next article sibling of the first article: {next_article.get('id', '') if next_article else 'none'}")

# Find a preceding sibling by tag name (sidebar_div has none, so this prints 'none')
sidebar_div = soup.find('div', class_='sidebar')
prev_div = sidebar_div.find_previous_sibling('div')
print(f"Previous div sibling of sidebar_div: {prev_div.get('class', '') if prev_div else 'none'}")
find_next() and find_all_next()

# Find specific nodes that come later in the document
header = soup.find('header')
# The first article after header
next_article = header.find_next('article')
print(f"First article after header: {next_article.get('id', '') if next_article else 'none'}")

# All h2 tags after header
all_next_h2 = header.find_all_next('h2')
print("All h2 headings after header:")
for h2 in all_next_h2:
    print(f"- {h2.text}")
find_previous() and find_all_previous()

# Find specific nodes that come earlier in the document
footer = soup.find('footer')
# The last article before footer
prev_article = footer.find_previous('article')
print(f"Last article before footer: {prev_article.get('id', '') if prev_article else 'none'}")

# All h2 tags before footer
all_prev_h2 = footer.find_all_previous('h2')
print("All h2 headings before footer:")
for h2 in all_prev_h2:
    print(f"- {h2.text}")
def get_real_siblings(element):
    """Get the following siblings that are not whitespace-only text."""
    siblings = []
    for sibling in element.next_siblings:
        # Skip whitespace-only text nodes
        if isinstance(sibling, str) and sibling.strip() == '':
            continue
        siblings.append(sibling)
    return siblings

# Usage
h1 = soup.h1
real_siblings = get_real_siblings(h1)
print(f"Number of non-whitespace siblings of h1: {len(real_siblings)}")
def safe_navigation(element, *steps):
    """Perform a chain of navigation steps, returning None if any step fails."""
    current = element
    for step in steps:
        if current is None:
            return None
        if step == 'parent':
            current = current.parent
        elif step == 'next_sibling':
            current = current.next_sibling
        elif step == 'previous_sibling':
            current = current.previous_sibling
        elif step.startswith('find_'):
            # Dispatch to the named find_* method if it exists
            method = getattr(current, step, None)
            current = method() if callable(method) else None
    return current

# Usage
first_a = soup.find('a')
result = safe_navigation(first_a, 'parent', 'parent', 'find_next_sibling')
print(f"Navigation result: {result.name if result else 'none'}")
def get_element_path(element):
    """Build the full path of an element within the document."""
    path = []
    current = element
    while current and current.name:
        path.insert(0, current.name)
        current = current.parent
    return ' > '.join(path)

# Usage
first_h2 = soup.find('h2')
path = get_element_path(first_h2)
print(f"Document path of h2: {path}")