Selenium Case Study

Continuing from the previous article.

Use Chrome's inspect tool to locate the comment elements.

We can try to extract the comment data:

from selenium import webdriver
driver = webdriver.Firefox(executable_path=r"D:\GeckoDriver\geckodriver")
driver.get("http://www.santostang.com/2018/07/04/hello-world/")
comment = driver.find_element_by_css_selector('div.reply-content').find_element_by_tag_name('p')
print(comment.text)

However, this raises an error: selenium.common.exceptions.NoSuchElementException: Message: Unable to locate element: div.reply-content

Inspecting driver.page_source shows that the comment section is wrapped inside an iframe.

# switch into the iframe before locating the comment
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
s = Service(r"D:\GeckoDriver\geckodriver.exe")
# executable_path is deprecated in Selenium 4, so the driver is created via Service
driver = webdriver.Firefox(service=s)
driver.get("http://www.santostang.com/2018/07/04/hello-world/")
driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, "iframe[title='livere-comment']"))
comment = driver.find_element(By.CSS_SELECTOR, 'div.reply-content').find_element(By.TAG_NAME, 'p')
print(comment.text)

Now we can get the comment text.

Fetching all comments

Simulate clicking the button that loads more comments.

Inspect the button tag to identify the "load more" button:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
import time
s = Service(r"D:\GeckoDriver\geckodriver.exe")
driver = webdriver.Firefox(service=s)
driver.get("http://www.santostang.com/2018/07/04/hello-world/")
time.sleep(5)
for i in range(3):
    # scroll to the bottom of the page (optional)
    #driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # switch into the comment iframe, find the "load more" button and click it
    driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, "iframe[title='livere-comment']"))
    load_more = driver.find_element(By.CSS_SELECTOR, "div.list-reduce button")
    load_more.click()
    # switch back to the main document before the next iteration
    driver.switch_to.default_content()
    time.sleep(2)
driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, "iframe[title='livere-comment']"))
comments = driver.find_elements(By.CSS_SELECTOR, "div.reply-content")
for comment in comments:
    content = comment.find_element(By.TAG_NAME, 'p')
    print(content.text)

Selenium methods for automating page interactions (a short sketch follows the list):

  • clear: clears the content of an element
  • send_keys: simulates keyboard input
  • click: clicks an element
  • submit: submits a form
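
A minimal sketch of these methods, assuming a page with a standard search form (the element name "s" below is an assumption, typical of WordPress sites, and may need adapting):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service

s = Service(r"D:\GeckoDriver\geckodriver.exe")
driver = webdriver.Firefox(service=s)
driver.get("http://www.santostang.com/")
# "s" is assumed to be the name of the search input; adjust the selector to the real page
search_box = driver.find_element(By.NAME, "s")
search_box.clear()                 # clear any existing content
search_box.send_keys("selenium")   # simulate typing the query
search_box.submit()                # submit the enclosing form
# click() works the same way on buttons and links, e.g.:
# driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()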

Advanced Selenium operations

Use Selenium to control what the browser loads, which speeds up crawling:

  • control whether CSS is loaded
  • control whether image files are loaded
  • control whether JavaScript is executed
Limiting CSS loading
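
A minimal sketch, assuming Firefox's permissions.default.stylesheet preference (setting it to 2 blocks stylesheet loading), in the same style as the image example below:

from selenium import webdriver
from selenium.webdriver.firefox.service import Service

s = Service(r"D:\GeckoDriver\geckodriver.exe")
options = webdriver.FirefoxOptions()
options.set_preference('permissions.default.stylesheet', 2)  # 2 = block CSS stylesheets
driver = webdriver.Firefox(service=s, options=options)
driver.get("http://www.santostang.com/2018/07/04/hello-world/")
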
Limiting image loading
from selenium import webdriver
from selenium.webdriver.firefox.service import Service

s = Service(r"D:\GeckoDriver\geckodriver.exe")
options = webdriver.FirefoxOptions()
options.set_preference('permissions.default.image', 2)  # 2 = block images
driver = webdriver.Firefox(service=s, options=options)
driver.get("http://www.santostang.com/2018/07/04/hello-world/")

Limiting JavaScript execution
from selenium import webdriver
from selenium.webdriver.firefox.service import Service

s = Service(r"D:\GeckoDriver\geckodriver.exe")
options = webdriver.FirefoxOptions()
# options.set_preference('permissions.default.image', 2)
options.set_preference("javascript.enabled", False)  # disable JavaScript execution
driver = webdriver.Firefox(service=s, options=options)
driver.get("http://www.santostang.com/2018/07/04/hello-world/")

Hands-on practice

Goal: get each Airbnb listing's name, price, number of reviews, room type, number of beds, and number of guests.

Use the browser's F12 inspector to locate the elements that hold the data.

Start scraping the first page:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
import time

s = Service(r"D:\GeckoDriver\geckodriver.exe")
driver = webdriver.Firefox(service=s)
driver.get("https://www.airbnb.cn/s/Shenzhen--China?page=1")
time.sleep(5)
rent_list = driver.find_elements(By.CSS_SELECTOR, 'div._zhz0z8')
time.sleep(2)
for eachhouse in rent_list:
    name = eachhouse.find_element(By.CSS_SELECTOR, 'div._qrfr9x5')
    name = name.text
    price = eachhouse.find_element(By.CSS_SELECTOR, 'span._1d8yint7 :nth-child(2)')
    price = price.text
    comment = eachhouse.find_element(By.CSS_SELECTOR, 'span._69pvqtq')
    comment = comment.text
    house_type = eachhouse.find_element(By.CSS_SELECTOR, "span[style='color: rgb(118, 118, 118);']")
    house = house_type.text
    # the text combines room type, beds and guests; the slice positions are site-specific
    house_type = house[:7]
    bedroom = house[12:]
    print("---------------------------------")
    print("Listing name:", name)
    print("Price:", price)
    print("Number of reviews:", comment)
    print("Room type:", house_type)
    print("Layout:", bedroom)
    print("---------------------------------")
    time.sleep(3)

To crawl several pages of listings, observe the URLs: the page parameter is the only part that changes.
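
A minimal sketch of the pagination loop, reusing the selectors from the block above; the choice of three pages here is arbitrary:

from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
import time

s = Service(r"D:\GeckoDriver\geckodriver.exe")
driver = webdriver.Firefox(service=s)
for page in range(1, 4):
    # only the page parameter changes between result pages
    driver.get("https://www.airbnb.cn/s/Shenzhen--China?page={}".format(page))
    time.sleep(5)
    rent_list = driver.find_elements(By.CSS_SELECTOR, 'div._zhz0z8')
    for eachhouse in rent_list:
        name = eachhouse.find_element(By.CSS_SELECTOR, 'div._qrfr9x5').text
        print(name)
driver.quit()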

Parsing Web Pages

Parsing pages with regular expressions
How to use the re module
  • re.match()

    re.match(pattern, string, flags=0), where flags controls how the pattern is matched (e.g. case-insensitive or multiline matching)

    import re
    m = re.match('www', 'www.baidu.com')
    print(m)
    print(m.span())
    print(m.start())
    print(m.end())
    # <re.Match object; span=(0, 3), match='www'> 
    # (0, 3)                                      
    # 0                                           
    # 3                                           

    The pattern can be a full regular expression rather than a plain literal string.

    A regular expression is usually written with the r prefix, e.g. r'\d'; r means raw string, so backslashes inside the quotes are not given special treatment. A brief example of patterns and flags follows.
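
    A minimal sketch of a raw-string pattern and the flags argument:

    import re
    # the raw-string pattern r'\d+' matches one or more digits
    m = re.match(r'\d+', '2021-12-25')
    print(m.group())   # 2021
    # flags=re.IGNORECASE makes the match case-insensitive
    n = re.match('www', 'WWW.baidu.com', flags=re.IGNORECASE)
    print(n.group())   # WWW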

  • re.search()

    re.match() only matches at the beginning of the string, whereas re.search() scans the whole string and returns the first successful match

    import re
    m = re.match('com', 'www.baidu.com')
    n = re.search('com', 'www.baidu.com')
    print(m)
    print(n)
    # None
    # <re.Match object; span=(10, 13), match='com'>
  • re.findall()

    re.match() and re.search() stop after finding one match, whereas re.findall() returns every match

    import re
    text = '1234 is the first number and 5678 is the second number!'
    m_match = re.match('[0-9]+', text)
    m_search = re.search('[0-9]+', text)
    m_findall = re.findall('[0-9]+', text)
    print(m_match.group())
    print(m_search.group())
    print(m_findall)
    # 1234
    # 1234
    # ['1234', '5678']
Parsing pages with BeautifulSoup (bs4)

This was covered in the previous article.

Traversing the document tree (a short sketch follows this list)
  • use contents to get a list of child nodes: soup.header.div.contents
  • use children to iterate over the child tags: soup.header.div.children
  • use descendants to get all descendant nodes: soup.header.div.descendants
  • use parent to get the parent node: soup.header.div.a.parent
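
A minimal, self-contained sketch of these traversal attributes, using made-up markup:

from bs4 import BeautifulSoup

html = "<header><div><a href='/post'>title</a><span>date</span></div></header>"
soup = BeautifulSoup(html, 'lxml')

print(soup.header.div.contents)           # list of direct children: the a and span tags
for child in soup.header.div.children:    # generator over the same direct children
    print(child.name)
for node in soup.header.div.descendants:  # every nested node, including text nodes
    print(node)
print(soup.header.div.a.parent.name)      # div
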
Searching the document tree
  • the find method
  • the find_all method

These were used in the previous article.

They can also be combined with regular expressions:

soup.find_all(re.compile("^h")) matches every tag whose name starts with h

CSS selectors (a short sketch follows this list)
  • search layer by layer through tags: soup.select("header h3")
  • select the direct children of a tag: soup.select("div > a") returns the a tags that are direct children of div tags
  • search by attribute: soup.select("a[href^='www.baidu.com']") finds every a tag whose href starts with www.baidu.com
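
A minimal sketch of find_all with a regular expression and of select, again on made-up markup:

import re
from bs4 import BeautifulSoup

html = ("<header><h3>Title</h3>"
        "<div><a href='www.baidu.com/a'>first</a><a href='/b'>second</a></div></header>")
soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(re.compile("^h")))          # every tag whose name starts with h
print(soup.select("header h3"))                 # descendant search, layer by layer
print(soup.select("div > a"))                   # direct children of div tags
print(soup.select("a[href^='www.baidu.com']"))  # href attribute prefix match
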
Parsing pages with lxml

Get all the post titles:

import requests
from lxml import etree

url = "http://www.santostang.com/"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
res = requests.get(url, headers=headers)
html = etree.HTML(res.text)
title_list = html.xpath("//h1[@class='post-title']/a/text()")
print(title_list)
#['第四章 – 4.3 通过selenium 模拟浏览器抓取', '第四章 – 4.2 解析真实地址抓取', '第四章- 动态网页抓取 (解析真实地址 + selenium)', 'Hello world!']
  • XPath selection syntax

Expression   Description
nodename     selects all child nodes of the named node
/            selects from the root node
//           selects matching nodes anywhere in the document, regardless of their position
.            selects the current node
..           selects the parent of the current node
@            selects attributes
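
A minimal sketch of these XPath expressions on a small inline document (made-up markup):

from lxml import etree

html = etree.HTML("<div><h1 class='post-title'><a href='/p1'>First post</a></h1>"
                  "<h1 class='post-title'><a href='/p2'>Second post</a></h1></div>")

print(html.xpath("//h1/a/text()"))   # // finds the h1 nodes wherever they sit in the tree
print(html.xpath("//a/@href"))       # @ selects attribute values
print(html.xpath("//a/..")[0].tag)   # .. steps up to the parent h1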

Hands-on practice

Goal:

Get each second-hand listing's name, price, layout (rooms and halls), size, year built, contact person, address, and tags.

Use the browser's F12 inspector to find the elements.
Scrape the first page:
#!/usr/bin/python
# coding: UTF-8

import requests
from bs4 import BeautifulSoup
import time
url = "https://beijing.anjuke.com/sale/p1/"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

respond = requests.get(url, headers=headers)
soup = BeautifulSoup(respond.text, 'lxml')
house_list = soup.find_all('div', class_='property')
for house in house_list:
    name = house.find('h3', class_='property-content-title-name').text.strip()
    price = house.find('p', class_='property-price-total').text.strip()
    house_type = house.find('p', class_='property-content-info-text property-content-info-attribute').text.strip()
    # size and year built sit at fixed positions inside the info block (site-specific indices)
    size = house.find('div', class_='property-content-info').contents[2].text.strip()
    date = house.find('div', class_='property-content-info').contents[8].text.strip()
    connection_person = house.find('div', class_='property-extra').contents[2].text
    address = house.find('div', class_='property-content-info property-content-info-comm').text.strip()
    print("--------------------------------")
    print("Name:", name)
    print("Price:", price)
    print("Layout:", house_type)
    print("Size:", size)
    print("Year built:", date)
    print("Contact:", connection_person)
    print("Address:", address)
    print("--------------------------------")
    time.sleep(2)

Scraping the first ten pages

The URLs differ only in the page number, so wrapping the code above in a for loop is enough to scrape the first ten pages.
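
A minimal sketch of that loop, reusing the request and parsing code above; only the page number in the URL changes:

import requests
from bs4 import BeautifulSoup
import time

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
for page in range(1, 11):
    # /sale/p1/ through /sale/p10/
    url = "https://beijing.anjuke.com/sale/p{}/".format(page)
    respond = requests.get(url, headers=headers)
    soup = BeautifulSoup(respond.text, 'lxml')
    house_list = soup.find_all('div', class_='property')
    for house in house_list:
        name = house.find('h3', class_='property-content-title-name').text.strip()
        print(name)
    time.sleep(2)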

Data Storage

Basic storage
Saving to a TXT file
title = "This is a title"
with open('test.txt', 'w') as f:
    f.write(title)

Ways to open a file:

Mode   Read/Write        If the file does not exist   Write behavior
w      write only        created                      overwrites existing content
w+     read and write    created                      overwrites existing content
r      read only         error                        not writable
r+     read and write    error                        overwrites from the start
a      write only        created                      appends to the end
a+     read and write    created                      appends to the end
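
A small sketch contrasting overwrite ('w') and append ('a') mode on a throwaway file (the file name is arbitrary):

# 'w' truncates the file, 'a' appends to it
with open('modes_demo.txt', 'w') as f:
    f.write('first line\n')
with open('modes_demo.txt', 'a') as f:
    f.write('second line\n')
with open('modes_demo.txt', 'r') as f:
    print(f.read())   # both lines are present; 'w' would start over on the next run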

The join method:

title = '\t'.join(['name', 'title', 'age'])
with open('test.txt', 'w') as f:
    f.write(title)
Saving to CSV

CSV is a comma-separated values format; the files can be opened both in a plain-text editor and in Excel.

Reading a CSV file:

import csv
with open('test.csv', 'r', encoding='UTF-8') as f:
    csv_reader = csv.reader(f)
    for row in csv_reader:
        print(row)

# each row is converted to a list
# each element of the list is a string

Writing to a CSV file:

import csv
output_list = [1, 2, 3, 4]
# newline='' prevents blank lines between rows on Windows
with open('test.csv', 'a+', encoding='UTF-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(output_list)

Saving to a MySQL database
  • create a database named scraping

  • create a urls table

    Table schema:

    create table urls(
        id int not null auto_increment,
        url varchar(1000) not null,
        content varchar(4000) not null,
        created_time timestamp default current_timestamp,
        primary key (id)
    );

Controlling MySQL from Python
import pymysql

db = pymysql.connect(host='localhost', user='root', passwd='root', database='scraping')
cur = db.cursor()
cur.execute('select version()')
data = cur.fetchone()
print(data)
db.close()

This prints the MySQL version number.
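
Optionally, the database and table above can also be created from Python rather than the MySQL shell; a minimal sketch using the same connection parameters:

import pymysql

# connect without selecting a database so it can be created first
db = pymysql.connect(host='localhost', user='root', passwd='root')
cur = db.cursor()
cur.execute("create database if not exists scraping")
cur.execute("use scraping")
cur.execute("""create table if not exists urls(
    id int not null auto_increment,
    url varchar(1000) not null,
    content varchar(4000) not null,
    created_time timestamp default current_timestamp,
    primary key (id))""")
db.commit()
db.close()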

Inserting data:

import pymysql

db = pymysql.connect(host='localhost', user='root', passwd='root', database='scraping')
cur = db.cursor()
cur.execute("insert into urls (url, content) values('www.baidu.com', 'This is the content.')")
cur.close()
db.commit()
db.close()

The row is inserted successfully.

Next, we can store the previously scraped titles and their corresponding URLs in the database.

import pymysql
from bs4 import BeautifulSoup
import requests

db = pymysql.connect(host='localhost', user='root', passwd='root', database='scraping', charset='utf8')
cur = db.cursor()

url = "http://www.santostang.com/"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.text, 'lxml')
title_list = soup.find_all('h1', class_='post-title')
for eachtitle in title_list:
    url = eachtitle.a['href']
    title = eachtitle.a.text.strip()
    cur.execute("insert into urls (url, content) values (%s, %s)",(url, title))

cur.close()
db.commit()
db.close()

The rows are inserted into the database successfully.

Note: if Chinese text comes back garbled in MySQL query results:

show variables like 'character%';   -- check the current character set settings

set names gbk;   -- set the connection character set

Saving to a MongoDB database
Controlling MongoDB from Python

Install pymongo: pip3 install pymongo

Store the previously scraped data:

import pymongo
import requests
from bs4 import BeautifulSoup
import time

client = pymongo.MongoClient("mongodb://localhost/")
db = client.scraping
collection = db.blog

url = "http://www.santostang.com/"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.text, 'lxml')
title_list = soup.find_all('h1', class_='post-title')
for eachtitle in title_list:
    url = eachtitle.a['href']
    title = eachtitle.a.text.strip()
    post = {
        "url":url,
        "title":title,
        "date":time.strftime("%Y-%m-%d %X", time.localtime())
    }
    collection.insert_one(post)
    time.sleep(2)

The data is saved to the MongoDB database successfully.