基于Python的HTTPS协议模拟登录+爬取页面
之前写的一直没成功,原因是用的不是HTTPS相关的函数。这次仔细研究了一下,有两个需要注意的点:一是POST模拟登录时,header中的cookie值,不同的网站会有不同的要求;二是GET页面时,需要带上POST登录所得response中的set-cookie,这样才能利用已经登录成功的会话。写完POST和GET页面之后,顺便写了个简单的命令行实现。
import httplib, urllib
import urllib2
import cookielib
import sys
# Host name that Login and GetHtml open their HTTPS connections to.
host = 'buuuuuuu.knight.com'
# Path of the saved page that ParseHtml reads back.
file_text = "build_change.txt"
# Module-wide table of parsed results; GenerateResultTxt iterates its keys.
resultTable = {}
def Login(username, password, csrf='Gy2O70iSjOTbWhWgBLvf4HDuf4jUe4RP'):
    """POST the login form to ``/login/`` on ``host`` and return its cookies.

    The CSRF token is sent twice: as the ``csrfmiddlewaretoken`` form field
    and as the ``csrftoken`` cookie. Written for Python 2 (``httplib`` and
    ``urllib.urlencode``).

    Args:
        username: Value for the ``username`` form field.
        password: Value for the ``password`` form field.
        csrf: CSRF token used for both the form field and the cookie.

    Returns:
        The value of the response's ``set-cookie`` header, or ``None`` when
        the response carries no such header.
    """
    url = '/login/'
    # Derive Origin/Referer from ``host`` so they cannot drift apart from the
    # server that is actually contacted.
    origin = 'https://%s' % host
    form = urllib.urlencode({
        'username': username,
        'password': password,
        'next': '',
        'csrfmiddlewaretoken': csrf,
    })
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Cookie': 'csrftoken=%s' % csrf,
        'Referer': origin + url,
        'Origin': origin,
        # The published listing had "p_w_picpath/webp,/" here, which is not a
        # valid media range; this is the same Accept value GetHtml sends.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    conn = httplib.HTTPSConnection(host, 443)
    try:
        conn.request("POST", url, form, headers)
        response = conn.getresponse()
        print('Login:  %s %s' % (response.status, response.reason))
        return response.getheader("set-cookie")
    finally:
        # Always release the socket, even when the request raises.
        conn.close()
def GetHtml(_url, cookie):
    """GET ``_url`` from ``host`` over HTTPS and save the body to ``file_text``.

    Args:
        _url: Request path passed to the HTTP request line.
        cookie: Cookie header value, normally what ``Login`` returned. When it
            is empty or ``None`` no ``Cookie`` header is sent.

    Returns:
        None. The response body is written to the file named by ``file_text``.
    """
    get_headers = {
        # Must name the server we actually connect to; the original sent a
        # different, hard-coded host name here.
        'Host': host,
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        # "p_w_picpath/webp" in the published listing is not a media type.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    }
    if cookie:
        # Login returns None when the server set no cookie; do not forward
        # that as a header value.
        get_headers['Cookie'] = cookie

    conn = httplib.HTTPSConnection(host)
    try:
        conn.request("GET", _url, None, get_headers)
        res2 = conn.getresponse()
        print("Get %s: %s %s" % (_url, res2.status, res2.reason))
        data = res2.read()
    finally:
        conn.close()

    # Write to the same path ParseHtml reads from.
    with open(file_text, "w") as fp:
        fp.write(data)
def ParseHtml():
    """Collect every ``class="change-body"`` block of ``file_text`` into ``resultTable``.

    Each block is stored under a running integer key starting at 0. A block
    holds the text following the opening tag's ``>`` on the marker line, plus
    every subsequent line up to and including the first one containing
    ``</div>`` (or up to end of file when no such line exists).

    NOTE(review): the published listing lost its subscripts
    (``resultTable = topic``), which rebinds a local list and fails on
    ``list + str``. The per-block keys and the "text after the opening tag"
    rule are a reconstruction - confirm against real page markup.

    Returns:
        None. ``resultTable`` is emptied and refilled in place.
    """
    marker = 'class="change-body"'
    # Start clean so a second call does not leave entries from an earlier page.
    resultTable.clear()
    with open(file_text, "r") as fp:
        for line in fp:
            if marker not in line:
                continue
            # Keep only what follows the '>' that closes the marked tag.
            pieces = [line.partition(marker)[2].partition(">")[2]]
            # Continue on the same file iterator: consume the block's lines.
            for line in fp:
                pieces.append(line)
                if "</div>" in line:
                    break
            resultTable[len(resultTable)] = "".join(pieces)
    print("Parse html success.")
def GenerateResultTxt():
    """Write every entry of ``resultTable`` to ``build_change_result.txt``.

    Entries are written in ascending key order, each preceded by a separator
    line of dashes. The original wrote the whole dict object on every
    iteration, which raises ``TypeError``; the entry for the current key is
    written instead.

    Returns:
        None. The output file is created or overwritten in the current
        working directory.
    """
    separator = "-------------------------------------------------------------------------------------------\n"
    with open("build_change_result.txt", "w") as f:
        # Sort the keys: dict iteration order is not a contract to rely on.
        for m in sorted(resultTable):
            f.write(separator)
            f.write(resultTable[m])
    print("Generate result success : build_change_result.txt .")
def Help():
    """Print the command-line usage summary, one line per option or example.

    The example commands now consistently name the script ``BuildChange.py``;
    two of them previously read ``BuildChang.py``.
    """
    usage = (
        '-h : help',
        '-u : username(must)',
        '-p : password(must)',
        '-c : csrftoken(optional)',
        '-s : sandbox build id(must)',
        'For example:',
        'python BuildChange.py -h',
        'python BuildChange.py -u u -p p -s s1 s2',
        'python BuildChange.py -u u -p p -c c -s s1 s2',
    )
    for line in usage:
        print(line)
def _ParseOptions(com):
    """Return ``(username, password, csrf, sid1, sid2)`` parsed from ``com``.

    ``com[0]`` (the script name) is skipped. ``-u``, ``-p`` and ``-c`` each
    take one value; ``-s`` takes two. Options that were not given come back
    as ``""``. Returns ``None`` for an unknown token or an option that is
    missing its value(s).
    """
    single = {'-u': "", '-p': "", '-c': ""}
    sid1 = sid2 = ""
    length = len(com)
    i = 1
    while i < length:
        flag = com[i]
        if flag in single and i + 1 < length:
            single[flag] = com[i + 1]
            # Step over the value too, so it is never re-read as an option.
            i += 2
        elif flag == '-s' and i + 2 < length:
            sid1, sid2 = com[i + 1], com[i + 2]
            i += 3
        else:
            return None
    return single['-u'], single['-p'], single['-c'], sid1, sid2


def ParseParam(com):
    """Run the tool from an argv-style list: log in, fetch, parse, write.

    ``-h`` as the first argument prints the usage text and returns. Otherwise
    ``-u``, ``-p`` and ``-s <id1> <id2>`` are required and ``-c`` is optional;
    when any required value is missing a parameter error is printed and
    nothing else happens.

    The published listing compared and assigned the whole ``com`` list (its
    subscripts were lost) and tried to skip values with ``i += 1`` inside a
    ``for ... in range`` loop, which has no effect; parsing now lives in
    ``_ParseOptions``.

    Args:
        com: Argument list shaped like ``sys.argv`` (script name first).

    Returns:
        None.
    """
    if len(com) >= 2 and com[1] == '-h':
        Help()
        # Help was requested explicitly: do not also report a parameter error.
        return

    parsed = _ParseOptions(com)
    username, password, csrf, sid1, sid2 = parsed or ("", "", "", "", "")
    if not (username and password and sid1 and sid2):
        print(' Parameter error!')
        print(' You can use \"python BuildChange.py -h\" to see how can use this script. ')
        return

    if csrf:
        cookie = Login(username, password, csrf)
    else:
        # No token supplied: let Login fall back to its built-in default.
        cookie = Login(username, password)
    # NOTE(review): the doubled slashes are kept exactly as published; they may
    # be an artifact of the blog's escaping - confirm against the real site.
    _url = "//changelog//between//%s//and//%s/" % (sid1, sid2)
    GetHtml(_url, cookie)
    ParseHtml()
    GenerateResultTxt()
# Example invocation: C:\Python27\python.exe C:\Users\knight\Desktop\build\BuildChange.py -u xux -p KKKKKKKK -s 1859409 1858525
# Script entry point: hand the raw command line (script name included) to
# ParseParam when the file is run directly rather than imported.
if __name__ == "__main__":
    ParseParam(sys.argv)
页:
[1]