import requests
from bs4 import BeautifulSoup
import time
import os
import openpyxl
import datetime
# Browser-like User-Agent sent with article requests so the newspaper site
# does not reject the crawler as an obvious bot.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4146.4 Safari/537.36'
}
def getTitleUrl(html):
    """Collect article URLs from one day's Bazhong Daily front page.

    Parameters
    ----------
    html : BeautifulSoup
        Parsed front (index) page of a single day's paper.

    Returns
    -------
    list[list[str]]
        One inner list per page section (版面); each inner list holds the
        absolute URLs of that section's articles.
    """
    base = "http://szb.bznews.org/news_page/"
    title_url = []
    # First hop: the index page links to every page section of the day.
    for section_a in html.select("div.bm_table>li>a"):
        # Send the browser User-Agent for consistency with getArticle().
        r = requests.get(base + section_a["href"], headers=headers)
        r.encoding = "utf-8"
        section_html = BeautifulSoup(r.text, "html.parser")
        links = section_html.select(
            "div.center_table_left>div.words_content_table>li>a")
        # Second hop: turn each section's relative hrefs into absolute URLs.
        title_url.append([base + a["href"] for a in links])
    return title_url
def getDateList(start_date, end_date):
    """Build the Bazhong Daily index URL for every day in a date range.

    Parameters
    ----------
    start_date, end_date : str
        Dates in 'YYYYMMDD' form; the range is inclusive on both ends.

    Returns
    -------
    list[str]
        One index URL per day, e.g.
        'http://szb.bznews.org/news_page/index.html?sDate=2020-01-01'.
        The start date is always included, even when it is after the end
        date (preserves the original behavior).
    """
    base_url = "http://szb.bznews.org/news_page/index.html"
    day = datetime.datetime.strptime(start_date, '%Y%m%d')
    end = datetime.datetime.strptime(end_date, '%Y%m%d')
    # Format each day directly as YYYY-MM-DD instead of formatting to
    # YYYYMMDD and re-slicing afterwards.
    dates = [day.strftime('%Y-%m-%d')]
    while day < end:
        day += datetime.timedelta(days=1)
        dates.append(day.strftime('%Y-%m-%d'))
    return [base_url + '?sDate=' + d for d in dates]
def getArticle(article_url, keyword="恩阳"):
    """Download one article and check whether its body mentions *keyword*.

    Parameters
    ----------
    article_url : str
        Absolute URL of the article page.
    keyword : str, optional
        Phrase to search for in the article body (defaults to "恩阳",
        matching the original hard-coded value).

    Returns
    -------
    list
        ``[title, article_url]`` when the keyword is found ("(图片新闻)" is
        used as the title for picture news without a headline); an empty
        list when the keyword does not appear.
    """
    info = []
    time.sleep(1)  # throttle: be polite to the server, one request/second
    r = requests.get(article_url, headers=headers)
    html = BeautifulSoup(r.text, "html.parser")
    title2 = html.select("div.content_title")
    content = html.select("div.content_con")
    if keyword in str(content):
        # Guard against a missing title element as well as an empty one:
        # picture-only news items have no usable headline string.
        if not title2 or title2[0].string is None:
            print(article_url + " 没有标题的图片新闻,自己看!")
            info.append("(图片新闻)")
        else:
            print(article_url + " ----->找到相关信息!-->" + title2[0].string)
            info.append(title2[0].string)
        info.append(article_url)
    else:
        print("在" + article_url + "没有相关信息!")
    return info
def main():
    """Crawl Bazhong Daily over a user-given date range and save matches.

    Asks the user for a 'YYYYMMDD-YYYYMMDD' range, visits every day's
    front page, follows every article link, and writes the matching
    ``[title, url]`` rows to '巴中日报爬虫结果.xlsx'.

    Returns
    -------
    int
        0 on completion (shell-style status code, kept for compatibility).
    """
    wb = openpyxl.Workbook()
    ws = wb.create_sheet(index=0, title="巴中日报")
    myinfo = []
    userdate = input("输入开始日期和结束日期,格式为 YYYYMMDD-YYYYMMDD :")
    # Slice 'YYYYMMDD-YYYYMMDD' into start (chars 0-7) and end (chars 9-16).
    all_url = getDateList(userdate[0:8], userdate[9:17])
    for url in all_url:
        r = requests.get(url)  # fetch the day's front page
        r.encoding = "utf-8"
        if 200 != r.status_code:
            print("请求页面失败!")
            continue  # bug fix: skip this day instead of parsing the failed response
        html = BeautifulSoup(r.text, "html.parser")
        # 2-D list of article URLs, grouped by page section.
        daily = getTitleUrl(html)
        for section in daily:
            for article_url in section:
                row = getArticle(article_url)
                if row:  # empty list means the keyword was not found
                    myinfo.append(row)
    for item in myinfo:
        ws.append(item)
    wb.save("巴中日报爬虫结果.xlsx")
    print(myinfo)
    return 0


# Guard the entry point so importing this file does not start the crawl.
if __name__ == "__main__":
    main()
# --- Second, standalone scraper (Sichuan Daily) starts here: import libraries ---
from bs4 import BeautifulSoup
import requests
import time
import os
import openpyxl
import datetime
def getDateList(start_date, end_date):
    """Return one Sichuan Daily e-paper index URL per day.

    Both *start_date* and *end_date* are 'YYYYMMDD' strings; the range is
    inclusive, and the start date is always included even if it is after
    the end date.
    """
    base = "https://epaper.scdaily.cn/shtml/scrb/"
    cur = datetime.datetime.strptime(start_date, '%Y%m%d')
    last = datetime.datetime.strptime(end_date, '%Y%m%d')
    # Collect every day of the range as a YYYYMMDD string.
    days = [cur.strftime('%Y%m%d')]
    while cur < last:
        cur += datetime.timedelta(days=1)
        days.append(cur.strftime('%Y%m%d'))
    # The site addresses each day's index page directly by date suffix.
    return [base + d for d in days]
# Prompt for an inclusive date range and build one index URL per day.
userdate = input("输入开始日期和结束日期,格式为 YYYYMMDD-YYYYMMDD :")
# Slice 'YYYYMMDD-YYYYMMDD' into start (chars 0-7) and end (chars 9-16).
all_url=getDateList(userdate[0:8],userdate[9:17])
def pachong(url, keyword="恩阳"):
    """Scan one day of the Sichuan Daily e-paper for *keyword*.

    Parameters
    ----------
    url : str
        Index URL of a single day's paper (see getDateList above).
    keyword : str, optional
        Phrase to look for in each article body (defaults to "恩阳",
        matching the original hard-coded value).

    Side effects: prints the URL of every article whose body contains
    the keyword; returns nothing.
    """
    print("查找的关键词是:", keyword)
    # Fetch and parse the day's index page (site serves GB2312 text).
    r = requests.get(url)
    r.encoding = "gb2312"
    html = BeautifulSoup(r.text, "html.parser")

    def getTitleUrl(index_html):
        """Return absolute URLs of every article published that day."""
        base = "https://epaper.scdaily.cn/"
        title_url = []
        # First hop: index page -> one link per page section (版面).
        for section_a in index_html.select("a[target='_self']"):
            sec = requests.get(base + section_a["href"])
            sec.encoding = "gb2312"
            section_html = BeautifulSoup(sec.text, "html.parser")
            # Second hop: section page -> article links.
            for art_a in section_html.select("a.title_art"):
                title_url.append(base + art_a["href"])
        return title_url

    my_title_url = getTitleUrl(html)
    print(my_title_url)
    for myurl in my_title_url:
        page = requests.get(myurl)
        page.encoding = "gb2312"
        article_html = BeautifulSoup(page.text, "html.parser")
        article = article_html.select("div.main2_r>ul.news")
        # Keyword test on the stringified tag list, as in the first scraper.
        if keyword in str(article):
            print(myurl + " ----->找到相关信息!")
        # NOTE(review): indentation was lost in the original; the sleep is
        # assumed to be per-article to throttle requests — confirm placement.
        time.sleep(1)
# Crawl every day in the requested range, one index page at a time.
for day_url in all_url:
    pachong(day_url)