A few days ago I used a shell script to collect data from a forum, but it was too slow, so I switched to Python.
It basically works now, but because the requests go out so fast, my IP got banned by the forum. I think there must be a way around that.
The full code is as follows:
#coding:utf-8
import pycurl
import StringIO
import time
import re
from bs4 import BeautifulSoup

def initCurl():
    c = pycurl.Curl()
    # paste the raw cookie string copied out of the browser here
    c.setopt(pycurl.COOKIE, "cooke")
    #c.setopt(pycurl.COOKIEJAR, "cookie.txt")
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    return c
def GetDate(curl, url):
    head = ['Accept:*/*',
            'User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36']
    buf = StringIO.StringIO()
    curl.setopt(pycurl.WRITEFUNCTION, buf.write)
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.HTTPHEADER, head)
    curl.perform()
    the_page = buf.getvalue()
    buf.close()
    return the_page
def saveFile(data, filename):
    save_path = filename
    f_obj = open(save_path, 'w')
    f_obj.write(data)
    f_obj.close()

# threads that no longer exist get their IDs appended to a list file
def saveDele(id):
    save_path = 'delete.list'
    f_obj = open(save_path, 'a')
    f_obj.write(str(id) + "\n")
    f_obj.close()

c = initCurl()
# starting ID; set to 1 to begin from the very first thread
id = 1
# stop ID; 1000 here
while (id < 1000):
    print 'Collecting thread id:', id
    page = 1
    i = 2
    while (i == 2):
        url = 'http://www.forum-name.com/viewthread.php?tid=' + str(id) + '&page=' + str(page)
        html = GetDate(c, url)
        filename = str(id) + '-' + str(page) + '.html'
        # save the page under the thread ID and page number
        saveFile(html, filename)
        # parse the page with BeautifulSoup for the checks below
        soup = BeautifulSoup(html, "lxml")
        # sngr.org -- tired, rest for 1 second :(
        time.sleep(1)
        # check whether a next page exists; if it does, i is 2
        # ("下一页" is the forum's "next page" link text, which appears twice on the page)
        dict = soup.find_all("a", text="下一页", class_="next")
        i = len(dict)
        # check for an error box; if present, the thread no longer exists, record its ID
        deletedict = soup.find_all("div", class_="alert_error")
        delelen = len(deletedict)
        if (delelen == 1):
            print "Thread does not exist, writing to delete.list"
            saveDele(id)
        #print i
        # grab the thread's post time and print it to track progress
        try:
            # the tag may be missing
            em = soup.find("em", id=re.compile("autho")).get_text()
        except AttributeError as e:
            print "This thread looks abnormal"
        else:
            if em == None:
                print "This thread looks abnormal"
            else:
                print em
        # done with this page, just move on to the next one
        page = page + 1
        #print page
        edict = soup.find_all("div", class_="alert_act")
        errorlen = len(edict)
        # an "alert_act" box means the session has been logged out, so break
        if (errorlen == 1):
            print "Not-logged-in warning detected, break!"
            break
    id = id + 1
    # https://sngr.org rest 1s and continue
    time.sleep(1)
    #print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# stop once id exceeds the configured limit
print "id exceeded the limit, finish!"
Next, maybe I can use proxy IPs to get around the IP ban. Here are some pages that might be useful:
ref: http://bigwayseo.com/512 & https://blog.csdn.net/xsj_blog/article/details/52102652 & https://junyiseo.com/python/607.html & https://flfq.peuland.com/index.php/2014/07/30/pycurl%E4%BD%BF%E7%94%A8socks5%E4%BB%A3%E7%90%86%E5%8A%9F%E8%83%BD/ & http://panweizeng.com/python-urllib2-socks-proxy.html
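As a rough idea of how that could plug in, here is a minimal sketch of routing pycurl through a proxy. The proxy address and port (127.0.0.1:1080) are placeholders for whatever proxy actually gets used, and PROXYTYPE_SOCKS5 can be swapped for PROXYTYPE_HTTP depending on the proxy type:

#coding:utf-8
import pycurl
import StringIO

def initCurlProxy():
    c = pycurl.Curl()
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    # placeholder proxy -- replace with a working one
    c.setopt(pycurl.PROXY, "127.0.0.1")
    c.setopt(pycurl.PROXYPORT, 1080)
    c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)  # or pycurl.PROXYTYPE_HTTP
    return c

# quick check: fetch one page through the proxy
buf = StringIO.StringIO()
c = initCurlProxy()
c.setopt(pycurl.WRITEFUNCTION, buf.write)
c.setopt(pycurl.URL, 'http://www.forum-name.com/viewthread.php?tid=1&page=1')
c.perform()
print buf.getvalue()[:200]
buf.close()
c.close()

Rotating through several proxies (re-setting pycurl.PROXY between requests) would probably be the next step, so that no single address gets banned.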
The code above relies on a cookie taken straight out of the browser and pasted in, so there is no login step at all.
If I ever need Python to log in by itself and then save the cookie for reuse, I don't know how to do that yet, hoho...
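My rough understanding is that pycurl can do the login POST itself and let libcurl persist the session cookies via COOKIEJAR/COOKIEFILE. The sketch below is untested against this forum; the login URL and the 'username'/'password' field names are made-up placeholders:

#coding:utf-8
import pycurl
import StringIO
import urllib

c = pycurl.Curl()
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.COOKIEFILE, "cookie.txt")  # read saved cookies on later runs
c.setopt(pycurl.COOKIEJAR, "cookie.txt")   # write cookies out when the handle is closed

# POST the login form (URL and field names are assumptions, check the real form)
post_data = urllib.urlencode({'username': 'me', 'password': 'secret'})
buf = StringIO.StringIO()
c.setopt(pycurl.URL, 'http://www.forum-name.com/logging.php?action=login')
c.setopt(pycurl.POSTFIELDS, post_data)
c.setopt(pycurl.WRITEFUNCTION, buf.write)
c.perform()
buf.close()

# further requests on the same handle now carry the session cookie,
# and cookie.txt is written when the handle is closed
c.close()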