获取HTML标签的属性和文本内容是BeautifulSoup最核心的功能之一。本章将详细讲解如何从标签中提取属性值、文本内容,以及处理各种特殊情况,让你能够灵活地从HTML文档中提取所需数据。
为了演示各种获取方法,我们使用以下HTML文档作为示例:
from bs4 import BeautifulSoup
html_doc = """
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<title>示例页面 - 学习BeautifulSoup</title>
<meta charset="UTF-8">
<meta name="description" content="这是一个用于学习BeautifulSoup的示例页面">
<meta name="keywords" content="Python,BeautifulSoup,网页解析">
<link rel="stylesheet" href="/css/style.css" type="text/css">
</head>
<body>
<div id="header" class="page-header main-header" data-version="1.0">
<h1 class="title">欢迎学习BeautifulSoup</h1>
<p class="subtitle" data-show="true">掌握HTML解析技巧</p>
</div>
<nav class="main-nav" aria-label="主导航">
<ul>
<li><a href="/home" class="nav-link active" id="home-link">首页</a></li>
<li><a href="/tutorials" class="nav-link" id="tutorials-link">教程</a></li>
<li><a href="/examples" class="nav-link" id="examples-link">示例</a></li>
<li><a href="/contact" class="nav-link" id="contact-link" data-modal="true">联系</a></li>
</ul>
</nav>
<main id="content" class="main-content">
<article class="post featured" data-id="123" data-category="tutorial">
<h2 class="post-title">Python网页解析入门</h2>
<div class="post-meta">
<span class="author">作者: 张三</span>
<span class="date">发布日期: 2024-01-15</span>
<span class="read-time" data-minutes="5">阅读时间: 5分钟</span>
</div>
<div class="post-content">
<p>BeautifulSoup是Python中最流行的HTML解析库之一。</p>
<p>它提供了简单的方法来<strong>导航、搜索和修改</strong>解析树。</p>
<div class="code-block" data-language="python">
<pre><code>from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')</code></pre>
</div>
<p>学习BeautifulSoup可以帮助你轻松地从网页中提取数据。</p>
</div>
</article>
<aside class="sidebar">
<div class="widget" data-widget-type="category">
<h3>相关主题</h3>
<ul>
<li><a href="/python" class="topic-link">Python编程</a></li>
<li><a href="/web-scraping" class="topic-link">网页抓取</a></li>
<li><a href="/data-analysis" class="topic-link">数据分析</a></li>
</ul>
</div>
</aside>
</main>
<footer class="site-footer">
<p>版权所有 © 2024 - 学习BeautifulSoup</p>
<p class="contact-info">
联系方式:
<a href="mailto:contact@example.com" class="email-link">contact@example.com</a>
<a href="tel:+861234567890" class="phone-link">+86 1234567890</a>
</p>
</footer>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
# 获取h1标签的class属性
h1_tag = soup.find('h1')
if h1_tag:
print(f"h1标签的class属性: {h1_tag['class']}")
# 获取div的id属性
header_div = soup.find('div', id='header')
if header_div:
print(f"div的id属性: {header_div['id']}")
print(f"div的class属性: {header_div['class']}")
print(f"div的data-version属性: {header_div['data-version']}")
# 获取链接的href属性
first_link = soup.find('a')
if first_link:
print(f"第一个链接的href属性: {first_link['href']}")
print(f"第一个链接的class属性: {first_link['class']}")
# get()方法更安全,可以指定默认值
article = soup.find('article')
if article:
# 获取存在的属性
data_id = article.get('data-id')
print(f"文章的data-id: {data_id}")
# 获取不存在的属性,返回默认值
non_existent = article.get('data-author', '未设置作者')
print(f"文章的data-author: {non_existent}")
# 不指定默认值,返回None
none_value = article.get('data-none')
print(f"不存在的属性: {none_value}")
# 对比两种方式
link = soup.find('a', id='contact-link')
if link:
# 直接使用[]可能抛出KeyError
try:
modal = link['data-modal']
print(f"直接访问: {modal}")
except KeyError:
print("直接访问: 属性不存在")
# 使用get()更安全
modal_safe = link.get('data-modal', 'false')
print(f"使用get()访问: {modal_safe}")
# 获取标签的所有属性
meta_desc = soup.find('meta', attrs={'name': 'description'})
if meta_desc:
print(f"描述meta的所有属性: {meta_desc.attrs}")
# 遍历所有属性
print("\n属性详情:")
for attr_name, attr_value in meta_desc.attrs.items():
print(f" {attr_name}: {attr_value}")
# 检查属性是否存在
article = soup.find('article')
if article:
print(f"\n文章标签是否具有class属性: {'class' in article.attrs}")
print(f"文章标签是否具有data-id属性: {'data-id' in article.attrs}")
print(f"文章标签是否具有href属性: {'href' in article.attrs}")
# class属性是一个列表
header_div = soup.find('div', id='header')
if header_div:
classes = header_div.get('class', [])
print(f"header的class列表: {classes}")
print(f"class列表类型: {type(classes)}")
# 检查是否包含特定类
print(f"是否包含'page-header': {'page-header' in classes}")
print(f"是否包含'main-header': {'main-header' in classes}")
# 遍历所有类
print("\n所有class:")
for class_name in classes:
print(f" - {class_name}")
# 对于具有多个类的元素
post = soup.find('article', class_='post')
if post:
post_classes = post.get('class', [])
print(f"\n文章的class: {post_classes}")
# 检查是否同时包含多个类
has_both = all(cls in post_classes for cls in ['post', 'featured'])
print(f"是否同时包含'post'和'featured': {has_both}")
# 处理rel属性(另一个多值属性)
link_tag = soup.find('link')
if link_tag:
rel_value = link_tag.get('rel', [])
print(f"link的rel属性: {rel_value}")
# 如果是字符串,转换为列表
if isinstance(rel_value, str):
rel_list = rel_value.split()
print(f"转换后的rel列表: {rel_list}")
# 自定义多值属性处理函数
def get_multi_value_attr(element, attr_name):
"""安全获取多值属性,返回列表"""
value = element.get(attr_name, '')
if isinstance(value, list):
return value
elif isinstance(value, str):
# 按空格分割
return [v for v in value.split() if v]
else:
return []
# 使用示例
first_a = soup.find('a')
if first_a:
classes = get_multi_value_attr(first_a, 'class')
print(f"链接的class列表: {classes}")
# string属性适用于只有一个子节点的标签
title = soup.find('title')
if title:
print(f"页面标题(string): {title.string}")
print(f"类型: {type(title.string)}")
# 对于只有一个文本子节点的标签
h1 = soup.find('h1')
if h1:
print(f"h1内容(string): {h1.string}")
# 对于有多个子节点的标签,string返回None
post_content = soup.find('div', class_='post-content')
if post_content:
print(f"post-content的string: {post_content.string}") # None
# 获取文本节点的父标签
# 注意:stripped_strings产出的是普通str,没有parent属性,
# 因此这里遍历soup.strings(产出NavigableString)并手动strip
for s in soup.strings:
    text = s.strip()
    if 'BeautifulSoup' in text:
        print(f"包含'BeautifulSoup'的文本: '{text}', 父标签: {s.parent.name}")
# text属性获取标签及其所有后代的文本
post = soup.find('article')
if post:
print(f"文章的所有文本(text):\n{post.text[:200]}...") # 只显示前200字符
# 获取特定元素的文本
post_title = soup.find('h2', class_='post-title')
if post_title:
print(f"文章标题(text): {post_title.text}")
# text vs string
example_html = "<p>这是<strong>加粗</strong>文本</p>"
example_soup = BeautifulSoup(example_html, 'lxml')
p_tag = example_soup.find('p')
print(f"\nstring属性: {p_tag.string}") # None,因为有多个子节点
print(f"text属性: {p_tag.text}") # '这是加粗文本'
# get_text()与text相似,但可以添加参数
post_content = soup.find('div', class_='post-content')
if post_content:
# 默认用法
default_text = post_content.get_text()
print(f"默认提取的文本(前100字符):\n{default_text[:100]}...")
# 指定分隔符
separated_text = post_content.get_text(separator=' | ')
print(f"\n使用分隔符的文本:\n{separated_text[:100]}...")
# 移除首尾空白
stripped_text = post_content.get_text(strip=True)
print(f"\n移除空白的文本(前100字符):\n{stripped_text[:100]}...")
# 限制文本长度
limited_text = post_content.get_text(separator=' ', strip=True)
if len(limited_text) > 100:
limited_text = limited_text[:100] + '...'
print(f"\n限制长度的文本:\n{limited_text}")
# 遍历所有段落并获取文本
paragraphs = soup.find_all('p')
print(f"\n页面中的段落数量: {len(paragraphs)}")
print("前3个段落的文本:")
for i, p in enumerate(paragraphs[:3], 1):
text = p.get_text(strip=True)
print(f"{i}. {text}")
# 获取data-*自定义属性
article = soup.find('article')
if article:
# 直接访问
data_id = article.get('data-id')
data_category = article.get('data-category')
print(f"文章的data-id: {data_id}")
print(f"文章的data-category: {data_category}")
# 获取所有data属性
print("\n文章的所有data属性:")
for attr_name, attr_value in article.attrs.items():
if attr_name.startswith('data-'):
print(f" {attr_name}: {attr_value}")
# 处理多个data属性
read_time_span = soup.find('span', class_='read-time')
if read_time_span:
minutes = read_time_span.get('data-minutes')
print(f"\n阅读时间: {read_time_span.text}")
print(f"数据属性data-minutes: {minutes}")
# 查找具有特定data属性的元素
modal_links = soup.find_all(attrs={'data-modal': True})
print(f"\n具有data-modal属性的链接数量: {len(modal_links)}")
# 获取所有链接的href
all_links = soup.find_all('a')
print("页面中所有链接:")
for i, link in enumerate(all_links, 1):
href = link.get('href', '无href')
text = link.get_text(strip=True) or '无文本'
print(f"{i}. 文本: '{text}', href: {href}")
# 获取外部资源链接
link_tag = soup.find('link')
if link_tag:
    href = link_tag.get('href', '')
    rel = link_tag.get('rel', [])
    type_attr = link_tag.get('type', '')
    print(f"\nCSS链接: href={href}, rel={rel}, type={type_attr}")
# 处理相对URL和绝对URL
from urllib.parse import urljoin
base_url = 'https://example.com'
for link in soup.find_all('a', href=True):
href = link['href']
# 判断是否为相对URL
if href.startswith('/'):
absolute_url = urljoin(base_url, href)
print(f"相对URL: {href} -> 绝对URL: {absolute_url}")
else:
print(f"绝对URL: {href}")
# 获取图片src(虽然示例中没有img标签,但方法相同)
# img_tags = soup.find_all('img')
# for img in img_tags:
# src = img.get('src', '')
# alt = img.get('alt', '')
# print(f"图片: src={src}, alt={alt}")
# 危险的链式访问
try:
# 如果任何中间步骤返回None,就会抛出异常
dangerous = soup.find('nonexistent').find('a')['href']
print(dangerous)
except AttributeError as e:
print(f"链式访问错误: {e}")
# 安全的链式访问方法1:逐步检查
def safe_chain_access_1(soup):
first = soup.find('nav')
if first:
second = first.find('ul')
if second:
third = second.find('li')
if third:
fourth = third.find('a')
if fourth:
return fourth.get('href', '')
return None
# 安全的链式访问方法2:使用try-except
def safe_chain_access_2(soup):
try:
return soup.find('nav').find('ul').find('li').find('a')['href']
except (AttributeError, KeyError, TypeError):
return None
# 安全的链式访问方法3:使用get()和条件表达式
def safe_chain_access_3(soup):
nav = soup.find('nav')
ul = nav.find('ul') if nav else None
li = ul.find('li') if ul else None
a = li.find('a') if li else None
return a.get('href') if a else None
print(f"方法1结果: {safe_chain_access_1(soup)}")
print(f"方法2结果: {safe_chain_access_2(soup)}")
print(f"方法3结果: {safe_chain_access_3(soup)}")
class SafeSoupAccess:
"""BeautifulSoup安全访问工具类"""
@staticmethod
def get_attribute(element, attr_name, default=''):
"""安全获取元素属性"""
if element and hasattr(element, 'get'):
return element.get(attr_name, default)
return default
@staticmethod
def get_text(element, default='', strip=True):
"""安全获取文本内容"""
if element and hasattr(element, 'get_text'):
return element.get_text(strip=strip)
elif element and hasattr(element, 'text'):
text = element.text
return text.strip() if strip else text
return default
@staticmethod
def find_with_default(soup, *args, default=None, **kwargs):
"""安全查找元素,找不到时返回默认值"""
result = soup.find(*args, **kwargs)
return result if result is not None else default
@staticmethod
def chain_get(element, *attr_names, default=''):
"""链式获取属性"""
current = element
for attr_name in attr_names:
if current and hasattr(current, 'get'):
current = current.get(attr_name)
else:
return default
return current if current is not None else default
# 使用示例
safe = SafeSoupAccess()
# 安全获取属性
article = soup.find('article')
data_id = safe.get_attribute(article, 'data-id', '未知')
print(f"安全获取data-id: {data_id}")
# 安全获取文本
h1 = soup.find('h1')
h1_text = safe.get_text(h1, '无标题')
print(f"安全获取h1文本: {h1_text}")
# 链式获取属性
# 注意:chain_get逐级取的是"属性值",不会在子标签间导航;
# 要取<a>的href,应先用find()定位到<a>,再交给chain_get取属性
nav = soup.find('nav')
if nav:
    first_a = nav.find('a')
    first_link_href = safe.chain_get(first_a, 'href', default='#')
    print(f"链式获取href: {first_link_href}")
def extract_all_links_info(soup):
"""提取页面中所有链接的信息"""
links_info = []
for a_tag in soup.find_all('a'):
link_info = {
'text': a_tag.get_text(strip=True),
'href': a_tag.get('href', ''),
'id': a_tag.get('id', ''),
'class': a_tag.get('class', []),
'title': a_tag.get('title', ''),
'target': a_tag.get('target', ''),
'rel': a_tag.get('rel', []),
'data_attrs': {}
}
# 提取所有data-*属性
for attr_name, attr_value in a_tag.attrs.items():
if attr_name.startswith('data-'):
link_info['data_attrs'][attr_name] = attr_value
# 判断链接类型
href = link_info['href']
if href.startswith('mailto:'):
link_info['type'] = 'email'
elif href.startswith('tel:'):
link_info['type'] = 'phone'
elif href.startswith('#'):
link_info['type'] = 'anchor'
elif href.startswith('http'):
link_info['type'] = 'external'
else:
link_info['type'] = 'internal'
links_info.append(link_info)
return links_info
# 使用示例
links_info = extract_all_links_info(soup)
print(f"提取到 {len(links_info)} 个链接")
print("\n前3个链接信息:")
for i, link in enumerate(links_info[:3], 1):
print(f"{i}. 文本: '{link['text']}', href: {link['href']}, 类型: {link['type']}")
# 统计链接类型
from collections import Counter
link_types = Counter(link['type'] for link in links_info)
print(f"\n链接类型统计: {dict(link_types)}")
def extract_article_data(soup):
"""从文章元素提取结构化数据"""
article = soup.find('article')
if not article:
return None
data = {
'title': '',
'author': '',
'date': '',
'read_time': '',
'category': '',
'content': '',
'paragraphs': [],
'code_blocks': []
}
# 提取标题
title_elem = article.find('h2', class_='post-title')
if title_elem:
data['title'] = title_elem.get_text(strip=True)
# 提取作者
author_elem = article.find('span', class_='author')
if author_elem:
data['author'] = author_elem.get_text(strip=True).replace('作者:', '').strip()
# 提取日期
date_elem = article.find('span', class_='date')
if date_elem:
data['date'] = date_elem.get_text(strip=True).replace('发布日期:', '').strip()
# 提取阅读时间
read_time_elem = article.find('span', class_='read-time')
if read_time_elem:
data['read_time'] = read_time_elem.get_text(strip=True)
# 从data属性获取分钟数
minutes = read_time_elem.get('data-minutes')
if minutes:
data['read_time_minutes'] = int(minutes)
# 提取分类
data['category'] = article.get('data-category', '')
# 提取内容
content_elem = article.find('div', class_='post-content')
if content_elem:
# 完整内容
data['content'] = content_elem.get_text(separator=' ', strip=True)
# 段落
paragraphs = content_elem.find_all('p')
data['paragraphs'] = [p.get_text(strip=True) for p in paragraphs]
# 代码块
code_blocks = content_elem.find_all('div', class_='code-block')
for code_block in code_blocks:
code_data = {
'language': code_block.get('data-language', ''),
'code': code_block.get_text(strip=True)
}
data['code_blocks'].append(code_data)
# 提取文章属性
data['id'] = article.get('data-id', '')
data['classes'] = article.get('class', [])
return data
# 使用示例
article_data = extract_article_data(soup)
if article_data:
print("=== 文章数据 ===")
print(f"标题: {article_data['title']}")
print(f"作者: {article_data['author']}")
print(f"日期: {article_data['date']}")
print(f"阅读时间: {article_data['read_time']}")
print(f"分类: {article_data['category']}")
print(f"段落数量: {len(article_data['paragraphs'])}")
print(f"代码块数量: {len(article_data['code_blocks'])}")
if article_data['paragraphs']:
print("\n前2个段落:")
for i, para in enumerate(article_data['paragraphs'][:2], 1):
print(f"{i}. {para[:50]}...")
def extract_attributes_by_selector(soup, selector, attr_names, default=''):
    """根据CSS选择器批量提取属性;'text'作为特殊键返回文本内容"""
    results = []
    for element in soup.select(selector):
        attr_values = {}
        for attr_name in attr_names:
            if attr_name == 'text':
                # 'text'并不是HTML属性,element.get('text')只会返回默认值,
                # 因此这里特殊处理为get_text()
                attr_values['text'] = element.get_text(strip=True)
            else:
                attr_values[attr_name] = element.get(attr_name, default)
        results.append(attr_values)
    return results
# 使用示例:提取所有链接的href和文本
links_data = extract_attributes_by_selector(
soup,
'a',
['href', 'text', 'class', 'id'],
default=''
)
print(f"提取到 {len(links_data)} 个链接的属性")
for i, link_data in enumerate(links_data[:3], 1):
print(f"{i}. href: {link_data.get('href', '无')}, 文本: {link_data.get('text', '无')}")
# 提取所有meta标签的属性
meta_data = extract_attributes_by_selector(
soup,
'meta',
['name', 'content', 'charset', 'property'],
default=''
)
print(f"\n提取到 {len(meta_data)} 个meta标签")
for meta in meta_data:
if meta.get('name'):
print(f"name: {meta['name']}, content: {meta['content'][:30]}...")
def extract_and_analyze_text(soup, selector=None, strip=True, min_length=0):
"""提取文本并进行分析"""
if selector:
elements = soup.select(selector)
texts = [elem.get_text(strip=strip) for elem in elements]
else:
# 如果没有指定选择器,提取所有文本
texts = [text for text in soup.stripped_strings]
# 过滤太短的文本
texts = [text for text in texts if len(text) >= min_length]
# 统计分析
analysis = {
'total_texts': len(texts),
'total_characters': sum(len(text) for text in texts),
'avg_length': sum(len(text) for text in texts) / len(texts) if texts else 0,
'longest': max(texts, key=len) if texts else '',
'shortest': min(texts, key=len) if texts else '',
'word_count': sum(len(text.split()) for text in texts),
'texts_by_length': {}
}
# 按长度分组
for text in texts:
length = len(text)
if length in analysis['texts_by_length']:
analysis['texts_by_length'][length] += 1
else:
analysis['texts_by_length'][length] = 1
return texts, analysis
# 使用示例:分析段落文本
paragraphs_texts, para_analysis = extract_and_analyze_text(
soup,
selector='p',
strip=True,
min_length=10
)
print("=== 段落文本分析 ===")
print(f"段落数量: {para_analysis['total_texts']}")
print(f"总字符数: {para_analysis['total_characters']}")
print(f"平均长度: {para_analysis['avg_length']:.1f} 字符")
print(f"最长段落: {para_analysis['longest'][:50]}...")
print(f"总单词数: {para_analysis['word_count']}")
# 分析标题文本
headings_texts, headings_analysis = extract_and_analyze_text(
soup,
selector='h1, h2, h3',
strip=True
)
print("\n=== 标题文本分析 ===")
print(f"标题数量: {headings_analysis['total_texts']}")
print("所有标题:")
for text in headings_texts:
print(f" - {text}")
# 方法1:使用get()方法并指定默认值
element = soup.find('div')
attr_value = element.get('data-custom', '默认值')
# 方法2:使用条件表达式
attr_value = element['data-custom'] if element and 'data-custom' in element.attrs else '默认值'
# 方法3:使用try-except
try:
attr_value = element['data-custom']
except (KeyError, TypeError):
attr_value = '默认值'
# 方法4:封装成函数
def get_attribute_safe(element, attr_name, default=''):
if element and hasattr(element, 'attrs') and attr_name in element.attrs:
return element[attr_name]
return default
def extract_clean_text(soup):
"""提取干净的文本,排除脚本、样式等"""
# 创建副本以避免修改原始soup
soup_copy = BeautifulSoup(str(soup), 'lxml')
# 移除不需要的元素
for element in soup_copy(['script', 'style', 'meta', 'link', 'noscript']):
element.decompose()
# 获取文本
text = soup_copy.get_text(separator=' ', strip=True)
# 清理多余的空白
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split())
text = ' '.join(chunk for chunk in chunks if chunk)
return text
# 使用示例
clean_text = extract_clean_text(soup)
print(f"干净文本前200字符:\n{clean_text[:200]}...")
# 示例:从复杂嵌套结构中提取特定部分的文本
def extract_nested_text(soup):
"""从嵌套结构中提取结构化文本"""
result = {}
# 提取主内容
main = soup.find('main')
if main:
# 提取文章标题
article_title = main.find('h2', class_='post-title')
result['article_title'] = article_title.get_text(strip=True) if article_title else ''
# 提取文章内容(排除代码块)
post_content = main.find('div', class_='post-content')
if post_content:
# 复制内容以便修改
content_copy = BeautifulSoup(str(post_content), 'lxml')
# 移除代码块
for code_block in content_copy.find_all('div', class_='code-block'):
code_block.decompose()
# 提取文本
result['article_content'] = content_copy.get_text(separator=' ', strip=True)
# 单独提取代码块
code_blocks = post_content.find_all('div', class_='code-block')
result['code_blocks'] = [
{
'language': cb.get('data-language', ''),
'code': cb.get_text(strip=True)
}
for cb in code_blocks
]
# 提取侧边栏文本
sidebar = soup.find('aside', class_='sidebar')
if sidebar:
# 提取侧边栏所有链接文本
sidebar_links = sidebar.find_all('a')
result['sidebar_links'] = [
{
'text': link.get_text(strip=True),
'href': link.get('href', '')
}
for link in sidebar_links
]
return result
# 使用示例
nested_data = extract_nested_text(soup)
if nested_data:
print(f"文章标题: {nested_data.get('article_title', '无标题')}")
print(f"文章内容长度: {len(nested_data.get('article_content', ''))} 字符")
print(f"代码块数量: {len(nested_data.get('code_blocks', []))}")
print(f"侧边栏链接数量: {len(nested_data.get('sidebar_links', []))}")
| 场景 | 推荐方法 | 说明 |
|---|---|---|
| 获取属性值 | `.get(attr_name, default)` | 最安全,可指定默认值 |
| 获取单个文本节点 | `.string` | 仅当标签只有一个文本子节点时使用 |
| 获取所有文本 | `.get_text(strip=True)` | 获取标签及其后代的所有文本,可清理空白 |
| 获取class属性 | `.get('class', [])` | 返回列表,可进行成员检查 |
| 检查属性是否存在 | `attr_name in element.attrs` | 使用in操作符检查 |
| 获取所有属性 | `.attrs` | 返回字典,包含所有属性 |
| 链式访问 | 逐步检查或使用安全函数 | 避免None导致的AttributeError |

本章要点:
- 获取属性时优先使用`get()`方法,或先检查属性是否存在
- 根据场景选择`string`、`text`或`get_text()`
- 用`find()`和`find_all()`定位目标元素