Sngr

用python采集需登录的dz论坛完整代码
A few days ago I used shell scripts to collect forum data...
扫描右侧二维码阅读全文
02
2018/10

用python采集需登录的dz论坛完整代码

A few days ago I used shell scripts to collect forum data, but it was too slow, so I switched to Python instead.
Now it basically works, but because of the high request rate my IP was banned by the forum. I think there must be a way to solve that.
All code as follows:

#coding:utf-8
import pycurl
import StringIO
import urllib
import time
import re
from bs4 import BeautifulSoup  
def initCurl():
    # Build a pycurl handle preconfigured for the login-protected forum:
    # a session cookie (placeholder value here — paste the real cookie
    # string copied from a logged-in browser) plus redirect following.
    handle = pycurl.Curl()
    handle.setopt(pycurl.COOKIE, "cooke")
    # Alternative: persist cookies to disk instead of a fixed header.
    #handle.setopt(pycurl.COOKIEJAR, "cookie.txt")
    handle.setopt(pycurl.FOLLOWLOCATION, 1)
    handle.setopt(pycurl.MAXREDIRS, 5)
    return handle
def GetDate(curl, url):
    # Fetch *url* through the shared curl handle and return the raw
    # response body as a string.
    request_headers = [
        'Accept:*/*',
        'User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36']
    # Collect the body in an in-memory buffer via the write callback.
    body_buffer = StringIO.StringIO()
    curl.setopt(pycurl.WRITEFUNCTION, body_buffer.write)
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.HTTPHEADER, request_headers)
    curl.perform()
    response = body_buffer.getvalue()
    body_buffer.close()
    return response
def saveFile(data, filename):
    # Write *data* to *filename*, overwriting any existing file.
    # A context manager guarantees the handle is closed even if the
    # write raises (the original leaked the handle on error).
    with open(filename, 'w') as f_obj:
        f_obj.write(data)
#不存在的主题写入list文件
# Record thread ids that turned out not to exist.
def saveDele(id, save_path='delete.list'):
    # Append *id* (one per line) to the deletion list file.
    # *save_path* is a backward-compatible parameter; it defaults to the
    # original hard-coded 'delete.list'.  The context manager closes the
    # handle even if the write raises (the original leaked it on error).
    with open(save_path, 'a') as f_obj:
        f_obj.write(str(id) + "\n")
c = initCurl()
# Starting thread id; set to 1 to begin scraping from the very first post.
id = 1
# Upper bound for thread ids (exclusive).
# NOTE(review): the original comment said 100 but the loop stops at 1000.
while (id < 1000):
   print '开始采集帖子id为:', id
   page = 1
   # Sentinel: the inner loop keeps running while i == 2, i.e. while the
   # previous page contained "next page" links (two of them per page).
   i= 2
   while (i == 2):
        url = 'http://www.forum-name.com/viewthread.php?tid=' + str(id) + '&page=' + str(page)
        html = GetDate(c, url)
        filename = str(id) + '-' + str(page) + '.html'
        # Save the raw page as "<thread id>-<page>.html".
        saveFile(html,filename)
        # Parse the page with BeautifulSoup for the checks below.
        soup=BeautifulSoup(html, "lxml") 
        # sngr.org — tired, rest 1 second between requests :(
        time.sleep(1)
        # Check for a "next page" link; when present, the page carries two
        # matching anchors, so i stays 2 and the loop continues.
        dict = soup.find_all("a", text="下一页", class_="next")
        i = len(dict)
        # An "alert_error" div means the thread does not exist;
        # record its id in delete.list.
        deletedict = soup.find_all("div", class_="alert_error")
        delelen=len(deletedict)
        if (delelen == 1):
            print "主题不存在,写入deleteid.list"
            saveDele(id)
        #print i
        # Extract the thread's publish time and print it, purely so the
        # scraping progress can be followed on the console.
        try:
                # The tag is absent on abnormal threads, which raises
                # AttributeError from the .get_text() on None.
                em = soup.find("em", id=re.compile("autho")).get_text()
        except AttributeError as e:
                print "此主题不正常咯"
        else:
                if em == None:
                    print "此主题不正常咯"
                else:
                    print em
        # This page is done; blindly advance to the next page number.
        page = page + 1
        #print page
   # An "alert_act" warning on the last fetched page means the login
   # session has expired, so stop the whole run.
   edict = soup.find_all("div", class_="alert_act")
   errorlen=len(edict)
   if (errorlen == 1):
       print "出现未登陆提示,break!"
       break
   id = id + 1
   # https://sngr.org — rest 1s, then continue with the next thread.
   time.sleep(1)
   #print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
# Reached once id exceeds the configured bound (or after a break).
print "id已超出设定,finish!"

Next, maybe I can use proxy IPs to solve the IP-ban issue. Here are some web pages that may be useful:
ref: http://bigwayseo.com/512 & https://blog.csdn.net/xsj_blog/article/details/52102652 & https://junyiseo.com/python/607.html & https://flfq.peuland.com/index.php/2014/07/30/pycurl%E4%BD%BF%E7%94%A8socks5%E4%BB%A3%E7%90%86%E5%8A%9F%E8%83%BD/ & http://panweizeng.com/python-urllib2-socks-proxy.html

Last modification:October 2nd, 2018 at 05:34 pm
If you think my article is useful to you, please feel free to appreciate

2 comments

  1. Hoe

    这个很强啊,如果登录时遇到图片验证码怎么办啊

    1. Sngr
      @Hoe

      这个是带了cookie的,比如从浏览器里面提取出来的cookie直接用,没有登录这个过程。
      如果需要python里面去登录再保存cookie直接用,还不会 hoho···

Leave a Comment