当前位置:网站首页>Crawl the Douban Books Top 250 and import it into an SQLite database (or an Excel spreadsheet)
Crawl the Douban Books Top 250 and import it into an SQLite database (or an Excel spreadsheet)
2022-06-26 18:12:00 【Little fox dreams of going to fairy tale town】
Crawl the Douban Books Top 250 list and import it into an SQLite database (or an Excel spreadsheet).
For source code, please visit https://github.com/zhang020801/douban_bookTop250
1. Program source code
import re # Regular expressions
from bs4 import BeautifulSoup # Extract the data
import urllib.request,urllib.error # Request access to web page , Return to the web page source code
import xlwt # Save data to excel form
import sqlite3 # Save data to sqlist In the database
def main():
    """Scrape the Douban book Top 250 listing and store it in SQLite.

    Only the SQLite export (``saveData2``) is active; the Excel export
    (``saveData``) is available but not called here.
    """
    # Listing URL prefix; the paging offset is appended to it downstream.
    list_url = "https://book.douban.com/top250?start="
    db_file = "book.db"

    books = getData(list_url)
    # To write a spreadsheet instead, call:
    #     saveData(books, " Douban studies Top250.xls")
    saveData2(books, db_file)
# Regular expressions applied to the HTML text of a single book entry.
# The attribute name in the two anchor patterns must be plain ASCII "onclick":
# a look-alike Greek omicron as its first letter can never match real markup.
findlink = re.compile(r'<a href="(.*?)" onclick=".*?" title=".*?">')  # Book link
findtitle = re.compile(r'<a href=".*?" onclick=".*?" title="(.*?)">')  # Book name
findimglink = re.compile(r'<img src="(.*?)" width="90"/>')  # Cover link
# The next four patterns all read the same slash-separated <p class="pl"> line.
findauthor = re.compile(r'<p class="pl">(.*?) / (.*?) / .*? / .*?/.*?</p>')  # Author / translator (two groups)
findpress = re.compile(r'<p class="pl">.*? / .*? / (.*?) / .*?/.*?</p>')  # Press
findtime = re.compile(r'<p class="pl">.*? / .*? / .*? / (.*?) / .*?</p>')  # Publication date
findmoney = re.compile(r'<p class="pl">.*? / .*? / .*? / .*? / (.*?)</p>')  # Book price
findscore = re.compile(r'<span class="rating_nums">(.*?)</span>')  # Score
# NOTE(review): the literal " People comment on " looks like a machine
# translation of the label that actually appears on the page -- confirm
# against the live markup before relying on this pattern.
findpeople = re.compile(r'<span class="pl">.*?(.*?) People comment on .*?</span>',re.S)  # Number of raters
findjieshao = re.compile(r'<span class="inq">(.*?)</span>')  # One-line introduction
def getData(baseurl):
    """Scrape all ten listing pages and return one row per book.

    Args:
        baseurl: Listing URL prefix; the paging offset (0, 25, ..., 225) is
            appended to it for each request.

    Returns:
        A list of 10-item lists ordered as: title, score, book link, cover
        link, author/translator, press, publication date, price, number of
        raters, introduction.  A field whose pattern does not match is
        stored as " " instead of raising IndexError.
    """
    datalist = []
    for page in range(10):
        html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(html, "html.parser")
        # Every <table width="100%"> on the page is treated as one book.
        for table in soup.find_all('table', width="100%"):
            datalist.append(_parse_book(str(table)))
    return datalist


def _first_match(patterns, text, default=" "):
    """Return the first hit of the first pattern in *patterns* that matches."""
    for pattern in patterns:
        found = re.findall(pattern, text)
        if found:
            return found[0]
    return default


def _parse_book(item):
    """Extract the ten stored fields from the HTML text of one book table."""
    title = _first_match((findtitle,), item)
    score = _first_match((findscore,), item)
    link = _first_match((findlink,), item)
    imglink = _first_match((findimglink,), item)

    # Each info-line field tries the five-part pattern first and then a
    # shorter fallback for entries with fewer slash-separated parts.
    author = _first_match(
        (findauthor, r'<p class="pl">(.*?) / .*? / .*?</p>'), item)
    if isinstance(author, tuple):
        # findauthor has two groups, so its hit is an (author, translator)
        # pair; flatten it so the field is always a plain string.
        author = " / ".join(author)
    press = _first_match(
        (findpress, r'<p class="pl">.*? / (.*?) / .*? / .*?</p>'), item)
    pub_time = _first_match(
        (findtime, r'<p class="pl">.*? / .*? / (.*?) / .*?</p>'), item)
    money = _first_match(
        (findmoney, r'<p class="pl">.*? / .*? / .*?/ (.*?)</p>'), item)

    # Drop the opening parenthesis and line break captured before the count.
    people = _first_match((findpeople,), item).replace("(\n ", "")
    jieshao = _first_match((findjieshao,), item)

    return [title, score, link, imglink, author,
            press, pub_time, money, people, jieshao]
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The decoded page source, or "" when the request fails with
        ``urllib.error.URLError`` (its code and/or reason is printed).
    """
    # Header sent with every request.
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 80.0.3987.163Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The with-block closes the response even if read() or decode()
        # raises, so the underlying connection is not leaked.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def saveData(datalist, savepath):
    """Write the scraped rows to an Excel workbook at *savepath*.

    Args:
        datalist: Rows of book fields, in the same order as the column
            headings below.
        savepath: Destination path of the workbook file.

    The number of rows written follows ``len(datalist)`` rather than a fixed
    250, so a partial scrape no longer raises IndexError.
    """
    print(" Start saving ...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet(' Douban studies Top250', cell_overwrite_ok=True)
    col = (" Book name "," score "," Book links "," Cover image link "," author / translator "," Press. "," Publication date "," The price is "," Number of evaluators "," Brief introduction ")
    for j, heading in enumerate(col):
        sheet.write(0, j, heading)
    # Row 0 holds the headings, so data rows start at 1.
    for i, data in enumerate(datalist, start=1):
        print(" The first %d strip "%i)
        for j, value in enumerate(data[:len(col)]):
            if isinstance(value, (tuple, list)):
                # A two-group regex hit arrives as a tuple; flatten it to
                # text (presumably xlwt rejects sequence values -- confirm).
                value = " / ".join(map(str, value))
            sheet.write(i, j, value)
    book.save(savepath)
    print(" Save complete ")
def saveData2(datalist, dbpath):
    """Insert the scraped rows into the ``book250`` table of a SQLite file.

    Args:
        datalist: Rows of ten book fields in column order (title, score,
            book link, cover link, author, press, time, money, num, jieshao).
        dbpath: Path of the SQLite database file.

    Values are bound through ``?`` placeholders instead of being formatted
    into the SQL text, so quotes inside a title or introduction can neither
    break the statement nor inject SQL.
    """
    init_db(dbpath)
    sql = ('insert into book250 ('
           'title,score,book_link,Img_link,author,press,time,money,num,jieshao) '
           'values (?,?,?,?,?,?,?,?,?,?)')
    conn = sqlite3.connect(dbpath)
    try:
        # "with conn" commits once at the end, or rolls back on an error.
        with conn:
            for data in datalist:
                # str() mirrors the old "%s" formatting, so a non-string
                # field (e.g. a tuple) is stored as its text form.
                row = tuple(str(field) for field in data[:10])
                print(row)
                conn.execute(sql, row)
    finally:
        conn.close()
def init_db(dbpath):
    """Create the ``book250`` table in the SQLite file at *dbpath*.

    Args:
        dbpath: Path of the SQLite database file (created if missing).

    The statement uses ``if not exists`` so calling this again on the same
    file does not fail because the table is already there.
    """
    sql = ''' create table if not exists book250 ( id integer primary key autoincrement, title varchar , score numeric , book_link text, Img_link text, author text, press text, time text, money text, num numeric , jieshao text ) '''
    conn = sqlite3.connect(dbpath)
    try:
        # "with conn" commits on success and rolls back on an error.
        with conn:
            conn.execute(sql)
    finally:
        conn.close()
# Start the scrape only when this file is executed as a script.
if __name__ == "__main__":
    main()
2. Program run results
1) Data imported into the Excel spreadsheet
2) Data imported into the SQLite database
边栏推荐
猜你喜欢
Detailed explanation of asymmetric cryptosystem
Number of solutions for knapsack problem
MYSQL的下载与配置 mysql远程操控
Tsinghua & Shangtang & Shanghai AI & CUHK proposed Siamese image modeling, which has both linear probing and intensive prediction performance!
DoS及攻击方法详解
RSA concept explanation and tool recommendation - LMN
Applet setting button sharing function
JVM entry door (1)
in和exsits、count(*)查询优化
Chinese (Simplified) language pack
随机推荐
请指教同花顺开户选选择哪家券商比较好?现在在线开户安全么?
DoS及攻擊方法詳解
新手炒股开户选哪个证券公司比较好?怎样炒股比较安全??
Do you know how to compare two objects
博云,站在中国容器潮头
RuntimeError: CUDA error: out of memory自己的解决方法(情况比较特殊估计对大部分人不适用)
Handwritten promise all
Decision tree and random forest
将字符串B插入字符串A,有多少种插入办法可以使新串是一个回文串
PC端录制扫515地机器人/scan数据
临时关闭MySQL缓存
比较两个对象的大小关系原来可以如此花里胡哨
LeetCode 128最长连续序列
零时科技 | 智能合约安全系列文章之反编译篇
深层次安全定义剖析及加密技术
Digital signature standard (DSS)
Let torch cuda. is_ Experience of available() changing from false to true
贝叶斯网络详解
RSA concept explanation and tool recommendation - LMN
pycharm的plt.show()如何保持不关闭