[Python] 一键爬取TTB网站美图

作者: zhouxinhuagg 发布时间： 2019-12-26 4.22K 人阅读

初学Python3，看到一个美图网站，顺手写了一个脚本，自动爬取网站的套图信息并存入MySQL数据库中。

准备工作：
1、MySQL建表，共计3个：

存套图集信息

存套图集下的套图信息

存套图下的图片信息

第一个表：ttb_atlas

ID	int	主键，自增
AtlasName	varchar	套图集名称
AtlasUrl	varchar	套图集链接地址

第二个表：ttb_album

ID	int	主键，自增
AtlasName	varchar	套图集名称
AlbumName	varchar	套图名称
AlbumUrl	varchar	套图链接地址
AlbumPicUrl	varchar	套图预览照片链接
AlbumPicDate	date	套图日期

第三个表：ttb_photo

ID	int	主键，自增
AlbumName	varchar	套图名称
PhotoName	varchar	图片名称
PhotoUrl	varchar	图片链接地址

2、创建主函数程序ttb_lk.py

#!/usr/local/Cellar/python/3.7.1/bin
# -*- coding: UTF-8 -*-
import sys
sys.path.append("/Python")
import conf.ttb_manage as myconf
import conf.mysql_db as mysqldb
 
home_url = 'https://www.192td.com'


def _fetch_rows(sql):
    """Return every row selected by *sql* as dicts, or an empty list on failure."""
    db = mysqldb.Database()
    try:
        # fetch_all() gives '' without a connection and False on a query
        # error; neither is a usable row list, so fall back to "no rows".
        return db.fetch_all(sql) or []
    finally:
        # The rows are fully fetched here, so the connection is released
        # before the (long-running) crawl of those rows starts.
        db.close()


def main():
    """Run the three crawl stages in order: atlases, albums, then photos."""
    # Step 1: store the atlas index found on the home page.
    myconf.get_home_atlas(home_url)

    # Step 2: for every stored atlas, collect its albums
    # (album name, album link, number of pages).
    for row in _fetch_rows('select * from ttb_atlas'):
        myconf.get_atlas_album(row['AtlasName'], row['AtlasUrl'])
        print(row)

    # Step 3: for every stored album, collect its photos
    # (photo name, photo link).
    for row in _fetch_rows('select * from ttb_album'):
        myconf.get_album_photo(row['AlbumName'], row['AlbumUrl'])


if __name__ == '__main__':
    main()

3、创建程序ttb_manage.py

#!/usr/local/Cellar/python/3.7.1/bin
# -*- coding: UTF-8 -*-
import sys,requests,re,time,threading
from bs4 import BeautifulSoup
sys.path.append("/Python")
import conf.mysql_db as mysqldb
 
#======================================================
#获取首页套图集信息
def get_home_atlas(home_url):
    """Store every atlas linked from the home-page navigation in ``ttb_atlas``.

    Args:
        home_url: URL of the site's home page.

    Returns:
        True once all navigation entries have been processed.
    """
    soup = BeautifulSoup(get_html(home_url), "lxml")
    db = mysqldb.Database()
    try:
        for nav in soup.find_all(class_='childnav'):
            # Direct child *tags* only: plain iteration over the element also
            # yields whitespace text nodes, which have no <a> child.
            for item in nav.find_all(True, recursive=False):
                try:
                    name = item.string
                    href = item.a['href']
                except (KeyError, TypeError) as exc:
                    # Entry without a usable link: report it and keep going
                    # so one bad entry does not drop the remaining ones.
                    print(exc)
                    continue
                # The values come from scraped HTML and are placed inside
                # single-quoted SQL literals, so quotes and backslashes are
                # escaped before the statement is assembled.
                values = tuple(
                    str(v).replace("\\", "\\\\").replace("'", "''")
                    for v in (name, href)
                )
                sql = "insert into ttb_atlas(AtlasName,AtlasUrl) values('%s','%s')" % values
                db.execute(sql)
    finally:
        db.close()
    return True
 
#获取套图集下的套图信息:1
def get_atlas_album(AtlasName, AtlasUrl):
    """Visit every index page of one atlas and store the albums listed on it.

    Args:
        AtlasName: Name of the atlas; passed through to the album records.
        AtlasUrl: URL of the atlas' first index page.  Later pages are
            addressed as ``AtlasUrl + 'index_<n>.html'``.

    Returns:
        True once every page has been handed to ``get_atlas_album_html``.
    """
    soup = BeautifulSoup(get_html(AtlasUrl), "lxml")

    # The "尾页" (last page) link carries the highest page number.  An atlas
    # without that link, or with a link that has no page number, is treated
    # as having a single page instead of crashing.
    pages = 1
    last_link = soup.find('a', string='尾页')
    if last_link is not None:
        match = re.search(r'_(\d+)\.html', last_link.get('href') or '')
        if match:
            pages = int(match.group(1))

    # Pages are visited from the last one down to the first.
    for page in range(pages, 0, -1):
        if page == 1:
            Page_Url = AtlasUrl
        else:
            Page_Url = AtlasUrl + 'index_' + str(page) + '.html'
        print(AtlasName + ' URL:' + Page_Url)
        get_atlas_album_html(AtlasName, AtlasUrl, Page_Url)
        time.sleep(1)  # pause between page requests
    return True
 
#获取套图集下的套图信息:2
def get_atlas_album_html(AtlasName, AtlasUrl, Page_Url):
    """Store every album listed on one atlas index page in ``ttb_album``.

    For each entry the album name, album link, cover image link and date are
    extracted and inserted together with the atlas name.

    Args:
        AtlasName: Name of the atlas the page belongs to.
        AtlasUrl: Base URL of the atlas; album file names are appended to it.
        Page_Url: URL of the index page to parse.

    Returns:
        True after the page has been processed, False when the page has no
        element with class ``clearfix`` to read albums from.
    """
    soup = BeautifulSoup(get_html(Page_Url), "lxml")
    container = soup.find(class_='clearfix')
    if container is None:
        print(Page_Url + '   未找到套图列表！')
        return False

    db = mysqldb.Database()
    try:
        # Direct child tags only; text nodes between them carry no album data.
        for item in container.find_all(True, recursive=False):
            try:
                AlbumUrl = AtlasUrl + re.findall(r'/(\w+\.html)', item.a['href'])[0]
                AlbumPicDate = item.b.string
                AlbumName = item.span.string
                AlbumPicUrl = item.img['lazysrc']
            except (AttributeError, IndexError, KeyError, TypeError) as exc:
                # Entry that lacks one of the expected parts: report the
                # actual error and continue with the next entry.
                print(exc)
                continue
            # Scraped values go inside single-quoted SQL literals, so quotes
            # and backslashes are escaped before the statement is assembled.
            values = tuple(
                str(v).replace("\\", "\\\\").replace("'", "''")
                for v in (AtlasName, AlbumName, AlbumUrl, AlbumPicUrl, AlbumPicDate)
            )
            sql = "insert into ttb_album(AtlasName,AlbumName,AlbumUrl,AlbumPicUrl,AlbumPicDate) values('%s','%s','%s','%s','%s')" % values
            # execute() reports failure through its return value, so success
            # is only announced when the insert really went through.
            if db.execute(sql):
                print('%s URL:%s   插入成功！' % (AlbumName, AlbumUrl))
            else:
                print('%s URL:%s   插入失败！' % (AlbumName, AlbumUrl))
    finally:
        db.close()
    return True
 
#获取套图下的每套图片信息
def get_album_photo(AlbumName, AlbumUrl):
    """Store the photos of one album in ``ttb_photo``.

    The first album page is fetched and parsed here; it also tells how many
    pages (one photo each) the album has.  Every further page is handled by
    ``get_img_insert`` on its own thread.

    Args:
        AlbumName: Name of the album the photos belong to.
        AlbumUrl: URL of the album's first page, expected to end in ".html".

    Returns:
        True when the first photo was processed and the workers were started,
        False when the first page lacks the expected image or page count.
    """
    # Fetch the album's own first page (not a local debugging file).
    soup = BeautifulSoup(get_html(AlbumUrl), "lxml")

    try:
        PhotoName = soup.img['alt']
        PhotoUrl = soup.img['lazysrc']
        photo_count = int(soup.find('span', id='allnum').get_text())
    except (AttributeError, KeyError, TypeError, ValueError) as exc:
        # Unexpected page layout: report it and skip this album rather than
        # aborting the whole crawl.
        print('%s URL:%s   %s' % (AlbumName, AlbumUrl, exc))
        return False

    # Scraped values go inside single-quoted SQL literals, so quotes and
    # backslashes are escaped before the statement is assembled.
    values = tuple(
        str(v).replace("\\", "\\\\").replace("'", "''")
        for v in (AlbumName, PhotoName, PhotoUrl)
    )
    sql = "insert into ttb_photo(AlbumName,PhotoName,PhotoUrl) values('%s','%s','%s')" % values
    db = mysqldb.Database()
    try:
        db.execute(sql)
    finally:
        db.close()
    print("第1张：" + PhotoName + '  URL:' + PhotoUrl)

    # Page n of an album lives at "<album>_<n>.html"; [:-5] strips ".html".
    base_url = AlbumUrl[:-5]
    for i in range(2, photo_count + 1):
        url = base_url + "_" + str(i) + ".html"
        worker = threading.Thread(target=get_img_insert, args=(i, AlbumName, url))
        worker.start()
        time.sleep(0.5)  # stagger the worker start-up
    return True
 
#插入图片信息
def get_img_insert(i, AlbumName, url):
    """Fetch one album page and store its photo in ``ttb_photo``.

    Used as the thread target for every album page after the first.

    Args:
        i: 1-based position of the photo inside the album (used for output).
        AlbumName: Name of the album the photo belongs to.
        url: URL of the album page showing this photo.
    """
    soup = BeautifulSoup(get_html(url), "lxml")
    try:
        PhotoName = soup.img['alt']
        PhotoUrl = soup.img['lazysrc']
    except (KeyError, TypeError) as exc:
        # Page without the expected <img>: report it instead of letting the
        # worker thread die with a traceback.
        print("第" + str(i) + "张：" + url + '  ' + str(exc))
        return

    print("第" + str(i) + "张：" + PhotoName + '  URL:' + PhotoUrl)
    # Scraped values go inside single-quoted SQL literals, so quotes and
    # backslashes are escaped before the statement is assembled.
    values = tuple(
        str(v).replace("\\", "\\\\").replace("'", "''")
        for v in (AlbumName, PhotoName, PhotoUrl)
    )
    sql = "insert into ttb_photo(AlbumName,PhotoName,PhotoUrl) values('%s','%s','%s')" % values
    db = mysqldb.Database()
    try:
        db.execute(sql)
    finally:
        db.close()
    return
 
 
#======================================================
#获取网页信息，得到的html就是网页的源代码，传url，返回html
def get_html(url, timeout=30):
    """Download *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises, so a stalled connection cannot hang the crawl.

    Returns:
        The page source as a string.
    """
    # No Accept-Encoding entry on purpose: the former key 'Accept - Encoding'
    # was not a valid header name, and requests already announces the
    # encodings it is able to decode.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.encoding = 'utf-8'
    return resp.text

4、创建程序mysql_db.py(注意修改mysql连接的参数)

#!/usr/local/Cellar/python/3.7.1/bin
# -*- coding: UTF-8 -*-
import mysql.connector
import logging
 
# Logging setup.
# Module-level logger used by the Database class below.
logger = logging.getLogger("dbSql")
# Output format for log records.
# NOTE(review): this formatter is not attached to any handler in the code
# shown here — confirm whether a handler is configured elsewhere.
formatter = logging.Formatter('%(asctime)s%(levelname)-8s:%(message)s')
 
#数据库操作类
class Database:
    """Small helper around a single ``mysql.connector`` connection.

    Every operation is best-effort: failures are logged through the module
    ``logger`` and reported through the return value instead of being raised.
    When connecting fails the object stays usable; each operation then simply
    returns its "nothing done" value.

    Instances can be used as context managers, which closes the connection
    on exit.
    """

    def __init__(self, host='localhost', user='root', password='root',
                 database='lk', charset='utf8'):
        """Store the connection settings and try to connect.

        The defaults are the previously hard-coded settings, so ``Database()``
        behaves as before; pass arguments to target another server.

        Args:
            host: Database host address.
            user: Database user name.
            password: Database password.
            database: Name of the database (schema) to use.
            charset: Connection character set.
        """
        self._dbhost = host
        self._dbuser = user
        self._dbpassword = password
        self._dbname = database
        self._dbcharset = charset
        self._cursor = None
        self._conn = self.connectMysql()
        if self._conn:
            self._cursor = self._conn.cursor()

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection; exceptions from the ``with`` body propagate."""
        self.close()

    def connectMysql(self):
        """Open a connection using the stored settings.

        Returns:
            The connection object, or ``False`` when connecting failed.
        """
        try:
            return mysql.connector.connect(
                host=self._dbhost,
                user=self._dbuser,
                passwd=self._dbpassword,
                database=self._dbname,
                charset=self._dbcharset,
            )
        except Exception:
            # exc_info records why the connection failed, not just that it did.
            logger.error("connect database failed!", exc_info=True)
            return False

    def execute(self, sql, params=None):
        """Run a data-changing statement and commit it.

        Args:
            sql: The SQL statement.  With *params* it may contain ``%s``
                placeholders.
            params: Optional sequence of values bound to the placeholders by
                the driver, which takes care of quoting them.

        Returns:
            True when the statement was executed and committed, False when
            there is no connection or the statement failed.
        """
        if not self._conn:
            return False
        try:
            if params is None:
                self._cursor.execute(sql)
            else:
                self._cursor.execute(sql, params)
            self._conn.commit()
        except Exception:
            logger.warning("update database exception SQL=%s", sql, exc_info=True)
            return False
        return True

    def _query(self, sql, dictionary, single):
        """Run *sql* on a short-lived cursor and return one row or all rows.

        Returns '' without a connection and False when the query fails.
        """
        if not self._conn:
            return ''
        cursor = None
        try:
            # A dedicated cursor per query keeps the row format independent of
            # earlier calls.  It is buffered so that fetching a single row
            # leaves no unread rows behind on the connection.
            cursor = self._conn.cursor(dictionary=dictionary, buffered=True)
            cursor.execute(sql)
            return cursor.fetchone() if single else cursor.fetchall()
        except Exception:
            logger.warning("query database exception SQL=%s", sql, exc_info=True)
            return False
        finally:
            if cursor is not None:
                try:
                    cursor.close()
                except Exception:
                    logger.warning("close cursor exception SQL=%s", sql, exc_info=True)

    def fetch_all(self, sql):
        """Return all rows of *sql* as dicts keyed by column name.

        Returns '' without a connection and False when the query fails.
        """
        return self._query(sql, dictionary=True, single=False)

    def fetchall(self, sql):
        """Return all rows of *sql* as tuples (no column names).

        Returns '' without a connection and False when the query fails.
        """
        return self._query(sql, dictionary=False, single=False)

    def fetch_one(self, sql):
        """Return the first row of *sql* as a dict keyed by column name.

        Returns '' without a connection and False when the query fails.
        """
        return self._query(sql, dictionary=True, single=True)

    def fetchone(self, sql):
        """Return the first row of *sql* as a tuple (no column names).

        Returns '' without a connection and False when the query fails.
        """
        return self._query(sql, dictionary=False, single=True)

    def close(self):
        """Close the cursor and the connection if they are open.

        Calling it again, or on an instance that never connected, does
        nothing.

        Returns:
            True, always.
        """
        if self._conn:
            cursor = getattr(self, '_cursor', None)
            # Cursor first, then the connection it belongs to.
            for resource in (cursor, self._conn):
                if resource is None:
                    continue
                try:
                    resource.close()
                except Exception:
                    logger.warning(
                        "close database exception,%s,%s",
                        type(cursor), type(self._conn), exc_info=True,
                    )
            self._cursor = None
            self._conn = False
        return True

本文最后更新于2019年12月26日，涉及的内容可能已经失效；如已失效，直接留言反馈补链即可，我们会处理，谢谢。