from bs4 import BeautifulSoup

# Parse a one-paragraph fragment with the 'lxml' parser, then print the text
# held by its <p> node.
soup = BeautifulSoup('<p>Hello</p>', 'lxml')
paragraph = soup.p
print(paragraph.string)
结果:
Hello
Beautiful Soup 美化(prettify)的效果实例:
# Sample markup used for the demo; note that it never closes <body> or <html>.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="example.com/elsie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link1"><!-- Elsie --></a>,
<a href="example.com/lacie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link2">Lacie</a> and
<a href="example.com/tillie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# prettify() renders the parsed document as a string in a standard indented layout.
pretty_markup = soup.prettify()
print(pretty_markup)

# Text content of the <title> node.
title_tag = soup.title
print(title_tag.string)
结果:
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="example.com/elsie" rel="external nofollow" rel="external nofollow" rel="external nofollow" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="example.com/lacie" rel="external nofollow" rel="external nofollow" rel="external nofollow" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="example.com/tillie" rel="external nofollow" rel="external nofollow" rel="external nofollow" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
The Dormouse's story
下面举例说明选择元素、属性、名称的方法
# Sample markup shared by the node / attribute / name selection examples below.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="example.com/elsie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link1"><!-- Elsie --></a>,
<a href="example.com/lacie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link2">Lacie</a> and
<a href="example.com/tillie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Selecting by tag name: attribute access on the soup yields the first node with that name.
title_tag = soup.title
print('输出结果为title节点加里面的文字内容:\n', title_tag)
print('输出它的类型:\n', type(title_tag))
print('输出节点的文本内容:\n', title_tag.string)
print('结果是节点加其内部的所有内容:\n', soup.head)

first_p = soup.p
print('结果是第一个p节点的内容:\n', first_p)
print('利用name属性获取节点的名称:\n', title_tag.name)

# Note that some attribute lookups return a single string while others return
# a list of strings. For example, the value of the name attribute is unique,
# so a single string comes back. A node may carry several classes, however,
# so class comes back as a list.
p_attrs = first_p.attrs
print('每个节点可能有多个属性,比如id和class等:\n', p_attrs)
print('选择这个节点元素后,可以调用attrs获取所有属性:\n', p_attrs['name'])
print('获取p标签的name属性值:\n', first_p['name'])
print('获取p标签的class属性值:\n', first_p['class'])
print('获取第一个p节点的文本:\n', first_p.string)
结果:
输出结果为title节点加里面的文字内容:
<title>The Dormouse's story</title>
输出它的类型:
<class 'bs4.element.Tag'>
输出节点的文本内容:
The Dormouse's story
结果是节点加其内部的所有内容:
<head><title>The Dormouse's story</title></head>
结果是第一个p节点的内容:
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
利用name属性获取节点的名称:
title
每个节点可能有多个属性,比如id和class等:
{'class': ['title'], 'name': 'dromouse'}
选择这个节点元素后,可以调用attrs获取所有属性:
dromouse
获取p标签的name属性值:
dromouse
获取p标签的class属性值:
['title']
获取第一个p节点的文本:
The Dormouse's story
from bs4 import BeautifulSoup

# Parse a one-paragraph fragment with the 'lxml' parser, then print the text
# held by its <p> node.
soup = BeautifulSoup('<p>Hello</p>', 'lxml')
paragraph = soup.p
print(paragraph.string)
结果:
Hello
Beautiful Soup 美化(prettify)的效果实例:
# Sample markup used for the demo; note that it never closes <body> or <html>.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="example.com/elsie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link1"><!-- Elsie --></a>,
<a href="example.com/lacie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link2">Lacie</a> and
<a href="example.com/tillie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# prettify() renders the parsed document as a string in a standard indented layout.
pretty_markup = soup.prettify()
print(pretty_markup)

# Text content of the <title> node.
title_tag = soup.title
print(title_tag.string)
结果:
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="example.com/elsie" rel="external nofollow" rel="external nofollow" rel="external nofollow" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="example.com/lacie" rel="external nofollow" rel="external nofollow" rel="external nofollow" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="example.com/tillie" rel="external nofollow" rel="external nofollow" rel="external nofollow" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
The Dormouse's story
下面举例说明选择元素、属性、名称的方法
# Sample markup shared by the node / attribute / name selection examples below.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="example.com/elsie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link1"><!-- Elsie --></a>,
<a href="example.com/lacie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link2">Lacie</a> and
<a href="example.com/tillie" rel="external nofollow" rel="external nofollow" rel="external nofollow" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Selecting by tag name: attribute access on the soup yields the first node with that name.
title_tag = soup.title
print('输出结果为title节点加里面的文字内容:\n', title_tag)
print('输出它的类型:\n', type(title_tag))
print('输出节点的文本内容:\n', title_tag.string)
print('结果是节点加其内部的所有内容:\n', soup.head)

first_p = soup.p
print('结果是第一个p节点的内容:\n', first_p)
print('利用name属性获取节点的名称:\n', title_tag.name)

# Note that some attribute lookups return a single string while others return
# a list of strings. For example, the value of the name attribute is unique,
# so a single string comes back. A node may carry several classes, however,
# so class comes back as a list.
p_attrs = first_p.attrs
print('每个节点可能有多个属性,比如id和class等:\n', p_attrs)
print('选择这个节点元素后,可以调用attrs获取所有属性:\n', p_attrs['name'])
print('获取p标签的name属性值:\n', first_p['name'])
print('获取p标签的class属性值:\n', first_p['class'])
print('获取第一个p节点的文本:\n', first_p.string)
结果:
输出结果为title节点加里面的文字内容:
<title>The Dormouse's story</title>
输出它的类型:
<class 'bs4.element.Tag'>
输出节点的文本内容:
The Dormouse's story
结果是节点加其内部的所有内容:
<head><title>The Dormouse's story</title></head>
结果是第一个p节点的内容:
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
利用name属性获取节点的名称:
title
每个节点可能有多个属性,比如id和class等:
{'class': ['title'], 'name': 'dromouse'}
选择这个节点元素后,可以调用attrs获取所有属性:
dromouse
获取p标签的name属性值:
dromouse
获取p标签的class属性值:
['title']
获取第一个p节点的文本:
The Dormouse's story