import requests
from bs4 import BeautifulSoup
import time
import os
import openpyxl
import datetime
# Browser-like User-Agent sent with article requests so the newspaper site
# does not reject the crawler as an obvious bot.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4146.4 Safari/537.36'
}
def getTitleUrl(html):
    """Collect article URLs from one day's Bazhong Daily front page.

    Parameters
    ----------
    html : BeautifulSoup
        Parsed front (index) page of a single day's paper.

    Returns
    -------
    list[list[str]]
        One inner list per page section (版面); each inner list holds the
        absolute URLs of that section's articles.
    """
    base = "http://szb.bznews.org/news_page/"
    title_url = []
    # First hop: the index page links to every page section of the day.
    for section_a in html.select("div.bm_table>li>a"):
        # Send the browser User-Agent for consistency with getArticle().
        r = requests.get(base + section_a["href"], headers=headers)
        r.encoding = "utf-8"
        section_html = BeautifulSoup(r.text, "html.parser")
        links = section_html.select(
            "div.center_table_left>div.words_content_table>li>a")
        # Second hop: turn each section's relative hrefs into absolute URLs.
        title_url.append([base + a["href"] for a in links])
    return title_url
def getDateList(start_date, end_date):
    """Build the Bazhong Daily index URL for every day in a date range.

    Parameters
    ----------
    start_date, end_date : str
        Dates in 'YYYYMMDD' form; the range is inclusive on both ends.

    Returns
    -------
    list[str]
        One index URL per day, e.g.
        'http://szb.bznews.org/news_page/index.html?sDate=2020-01-01'.
        The start date is always included, even when it is after the end
        date (preserves the original behavior).
    """
    base_url = "http://szb.bznews.org/news_page/index.html"
    day = datetime.datetime.strptime(start_date, '%Y%m%d')
    end = datetime.datetime.strptime(end_date, '%Y%m%d')
    # Format each day directly as YYYY-MM-DD instead of formatting to
    # YYYYMMDD and re-slicing afterwards.
    dates = [day.strftime('%Y-%m-%d')]
    while day < end:
        day += datetime.timedelta(days=1)
        dates.append(day.strftime('%Y-%m-%d'))
    return [base_url + '?sDate=' + d for d in dates]
def getArticle(article_url, keyword="恩阳"):
    """Download one article and check whether its body mentions *keyword*.

    Parameters
    ----------
    article_url : str
        Absolute URL of the article page.
    keyword : str, optional
        Phrase to search for in the article body (defaults to "恩阳",
        matching the original hard-coded value).

    Returns
    -------
    list
        ``[title, article_url]`` when the keyword is found ("(图片新闻)" is
        used as the title for picture news without a headline); an empty
        list when the keyword does not appear.
    """
    info = []
    time.sleep(1)  # throttle: be polite to the server, one request/second
    r = requests.get(article_url, headers=headers)
    html = BeautifulSoup(r.text, "html.parser")
    title2 = html.select("div.content_title")
    content = html.select("div.content_con")
    if keyword in str(content):
        # Guard against a missing title element as well as an empty one:
        # picture-only news items have no usable headline string.
        if not title2 or title2[0].string is None:
            print(article_url + " 没有标题的图片新闻,自己看!")
            info.append("(图片新闻)")
        else:
            print(article_url + " ----->找到相关信息!-->" + title2[0].string)
            info.append(title2[0].string)
        info.append(article_url)
    else:
        print("在" + article_url + "没有相关信息!")
    return info
def main():
    """Crawl Bazhong Daily over a user-given date range and save matches.

    Asks the user for a 'YYYYMMDD-YYYYMMDD' range, visits every day's
    front page, follows every article link, and writes the matching
    ``[title, url]`` rows to '巴中日报爬虫结果.xlsx'.

    Returns
    -------
    int
        0 on completion (shell-style status code, kept for compatibility).
    """
    wb = openpyxl.Workbook()
    ws = wb.create_sheet(index=0, title="巴中日报")
    myinfo = []
    userdate = input("输入开始日期和结束日期,格式为 YYYYMMDD-YYYYMMDD :")
    # Slice 'YYYYMMDD-YYYYMMDD' into start (chars 0-7) and end (chars 9-16).
    all_url = getDateList(userdate[0:8], userdate[9:17])
    for url in all_url:
        r = requests.get(url)  # fetch the day's front page
        r.encoding = "utf-8"
        if 200 != r.status_code:
            print("请求页面失败!")
            continue  # bug fix: skip this day instead of parsing the failed response
        html = BeautifulSoup(r.text, "html.parser")
        # 2-D list of article URLs, grouped by page section.
        daily = getTitleUrl(html)
        for section in daily:
            for article_url in section:
                row = getArticle(article_url)
                if row:  # empty list means the keyword was not found
                    myinfo.append(row)
    for item in myinfo:
        ws.append(item)
    wb.save("巴中日报爬虫结果.xlsx")
    print(myinfo)
    return 0


# Guard the entry point so importing this file does not start the crawl.
if __name__ == "__main__":
    main()
# --- Second, standalone scraper (Sichuan Daily) starts here: import libraries ---
from bs4 import BeautifulSoup
import requests
import time
import os
import openpyxl
import datetime
def getDateList(start_date, end_date):
    """Return one Sichuan Daily e-paper index URL per day.

    Both *start_date* and *end_date* are 'YYYYMMDD' strings; the range is
    inclusive, and the start date is always included even if it is after
    the end date.
    """
    base = "https://epaper.scdaily.cn/shtml/scrb/"
    cur = datetime.datetime.strptime(start_date, '%Y%m%d')
    last = datetime.datetime.strptime(end_date, '%Y%m%d')
    # Collect every day of the range as a YYYYMMDD string.
    days = [cur.strftime('%Y%m%d')]
    while cur < last:
        cur += datetime.timedelta(days=1)
        days.append(cur.strftime('%Y%m%d'))
    # The site addresses each day's index page directly by date suffix.
    return [base + d for d in days]
# Prompt for an inclusive date range and build one index URL per day.
userdate = input("输入开始日期和结束日期,格式为 YYYYMMDD-YYYYMMDD :")
# Slice 'YYYYMMDD-YYYYMMDD' into start (chars 0-7) and end (chars 9-16).
all_url=getDateList(userdate[0:8],userdate[9:17])
def pachong(url, keyword="恩阳"):
    """Scan one day of the Sichuan Daily e-paper for *keyword*.

    Parameters
    ----------
    url : str
        Index URL of a single day's paper (see getDateList above).
    keyword : str, optional
        Phrase to look for in each article body (defaults to "恩阳",
        matching the original hard-coded value).

    Side effects: prints the URL of every article whose body contains
    the keyword; returns nothing.
    """
    print("查找的关键词是:", keyword)
    # Fetch and parse the day's index page (site serves GB2312 text).
    r = requests.get(url)
    r.encoding = "gb2312"
    html = BeautifulSoup(r.text, "html.parser")

    def getTitleUrl(index_html):
        """Return absolute URLs of every article published that day."""
        base = "https://epaper.scdaily.cn/"
        title_url = []
        # First hop: index page -> one link per page section (版面).
        for section_a in index_html.select("a[target='_self']"):
            sec = requests.get(base + section_a["href"])
            sec.encoding = "gb2312"
            section_html = BeautifulSoup(sec.text, "html.parser")
            # Second hop: section page -> article links.
            for art_a in section_html.select("a.title_art"):
                title_url.append(base + art_a["href"])
        return title_url

    my_title_url = getTitleUrl(html)
    print(my_title_url)
    for myurl in my_title_url:
        page = requests.get(myurl)
        page.encoding = "gb2312"
        article_html = BeautifulSoup(page.text, "html.parser")
        article = article_html.select("div.main2_r>ul.news")
        # Keyword test on the stringified tag list, as in the first scraper.
        if keyword in str(article):
            print(myurl + " ----->找到相关信息!")
        # NOTE(review): indentation was lost in the original; the sleep is
        # assumed to be per-article to throttle requests — confirm placement.
        time.sleep(1)
# Crawl every day in the requested range, one index page at a time.
for day_url in all_url:
    pachong(day_url)