- 判断文件或者文件夹是否存在
# os.path.exists() is True for both files and directories.
if not os.path.exists(rootdir):
    pass  # handle the missing path here
- 创建文件夹
# Creates one directory level only: the parent must already exist, and an
# already existing rootdir raises FileExistsError.
os.mkdir(rootdir)
- 调用系统命令
# Runs cmd in a subshell; the command's output is not captured.
os.system(cmd)
- 字典循环
# `mapping` stands for any dictionary; the builtin name `dict` is avoided on purpose.
for key, value in mapping.items():
    print(key, value)
- 打开文件并读取内容进行处理
# The with-block closes the file even if processing a line raises.
with open('xxxx.txt', encoding='utf-8') as fd:
    for line in fd:
        print(line)
- 创建文件并写入内容
# 'a+' creates the file when missing and appends otherwise; the with-block
# guarantees the handle is flushed and closed.
with open('xxxx.txt', 'a+', encoding='utf-8') as fd:
    fd.write('aaaaa' + '\n')
- 使用xlrd读取EXCEL
导入
import xlrd
打开excel
data = xlrd.open_workbook('demo.xls')  # note: "workbook" is all lowercase here
查看文件中包含sheet的名称
# Lists the names of the sheets contained in the workbook.
data.sheet_names()
得到第一个工作表,或者通过索引顺序 或 工作表名称
# Three alternative ways to obtain a sheet; each line rebinds `table`.
table = data.sheets()[0]
table = data.sheet_by_index(0)
table = data.sheet_by_name(u'Sheet1')
获取行数和列数
# Number of rows and columns of the sheet.
nrows = table.nrows
ncols = table.ncols
获取整行和整列的值(数组)
# Values of the whole row i / whole column i.
table.row_values(i)
table.col_values(i)
循环行,得到索引的列表
# Print every row of the sheet as a list of cell values.
for rownum in range(table.nrows):
    print(table.row_values(rownum))
单元格
# cell(rowx, colx) is zero-based: A1 -> (0, 0); C4 -> row index 3, column index 2.
cell_A1 = table.cell(0, 0).value
cell_C4 = table.cell(3, 2).value
分别使用行列索引
# Same cells via row()/col(): A1 is first row, first item; A2 is the second
# item of the first column (column index 0, row index 1).
cell_A1 = table.row(0)[0].value
cell_A2 = table.col(0)[1].value
简单的写入
# Put a string into the top-left cell of the sheet, then read it back.
row, col = 0, 0
value = 'lixiaoluo'
ctype = 1  # cell type: 0 empty, 1 string, 2 number, 3 date, 4 boolean, 5 error
xf = 0  # extended formatting index (0 is the default)
table.put_cell(row, col, ctype, value, xf)
table.cell(0, 0)  # text:u'lixiaoluo'
table.cell(0, 0).value  # 'lixiaoluo'
- 使用xlwt写入EXCEL
导入xlwt
import xlwt
新建一个excel文件
file = xlwt.Workbook()  # note: "Workbook" is capitalized here, unlike xlrd.open_workbook
新建一个sheet
# Adds a new sheet called 'sheet name' to the workbook.
table = file.add_sheet('sheet name')
写入数据table.write(行,列,value)
# Arguments are (row, column, value).
table.write(0,0,'test')
如果对一个单元格重复操作,会引发
returns error:
# Exception: Attempt to overwrite cell:
# sheetname=u'sheet 1' rowx=0 colx=0
所以在打开时加cell_overwrite_ok=True解决
# cell_overwrite_ok=True allows writing the same cell more than once.
table = file.add_sheet('sheet name',cell_overwrite_ok=True)
保存文件
# Writes the workbook to demo.xls.
file.save('demo.xls')
另外,使用style
# Describe the font first, then attach it to a fresh style object.
font = xlwt.Font()
font.name = 'Times New Roman'
font.bold = True
style = xlwt.XFStyle()
style.font = font
# Pass the style as the fourth argument of write().
table.write(0, 0, 'some bold Times text', style)
- 命令行getopt
# Short options: -h (flag), -p <port>, -i <ip>; long forms: --help, --ip=, --port=.
try:
    options, args = getopt.getopt(sys.argv[1:], "hp:i:", ["help", "ip=", "port="])
except getopt.GetoptError as err:
    # Report the problem and exit non-zero so callers can detect the bad invocation.
    print(err, file=sys.stderr)
    sys.exit(2)
for name, value in options:
    if name in ("-h", "--help"):
        usage()
    elif name in ("-i", "--ip"):
        print(value)
    elif name in ("-p", "--port"):
        print(value)
- 简单爬虫
import requests
AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
HEADERS = {
    'User-Agent': AGENT,
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Accept': '*/*',
}
# One shared session so the login cookies are reused by later requests.
session = requests.session()


# NOTE(review): the original snippet had lost its function header (it used
# `return` at top level); the name `login` is a reconstruction - confirm.
def login(url='xxxxxxxx'):
    """Simulate a form login on the shared session.

    Returns True when the server answers with HTTP 200, False otherwise.
    """
    postdata = {
        'defaults': 'xxx',
        'fromLogin': 'xxx',
        'userName': 'xxx',
        'password': 'xxxx',
    }
    # verify=False disables TLS certificate verification.
    login_info = session.post(url, headers=HEADERS, data=postdata, verify=False)
    if login_info.status_code == requests.codes.ok:
        print('login success')
        return True
    print('login err')
    return False
# Download an HTML page
def downloadUrl(rootdir, url, orgid, page):
    """Fetch url with the shared session and save it as <rootdir>/<orgid>_<page>.html.

    Returns "err" (writing nothing) when the body starts with a <script tag or
    is shorter than 60 characters, otherwise 'ok'.
    """
    html = session.get(url, headers=global_config.HEADERS, verify=False)
    text = html.text
    # A body starting with "<script" is treated as an error page and echoed.
    if text[1:7] == 'script':
        print(text)
        return "err"
    if len(text) < 60:
        return "err"
    target = rootdir + "/" + str(orgid) + '_' + str(page) + ".html"
    with open(target, "w", encoding='utf-8') as sample:
        sample.write(text)
    return 'ok'
- 解析JSON文件内容
def scrapy_by_row(row):
    """Extract the organization id and family id from one JSON row.

    Returns (orgid, familyid), or None (after printing 'errrr') when the row
    does not have the expected structure.
    """
    try:
        orgid = row['organization']['id']
        familyid = row['censusRegisterFamily']['id']
    except (KeyError, TypeError):
        print('errrr')
        return None
    return orgid, familyid


def scrapy_by_file(json_file_name):
    """Load a JSON file and run scrapy_by_row on every entry of its 'rows' list.

    When the file is not valid JSON, its name is printed and nothing is processed.
    """
    # 'utf-8-sig' drops the BOM that files coming from Windows may carry.
    with open(json_file_name, encoding='utf-8-sig') as json_file:
        text = json_file.read()
    try:
        json_data = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError
        print(json_file_name)
        return
    for row in json_data['rows']:
        scrapy_by_row(row)
- 遍历文件夹
# Walk rootdir recursively and call dirFunc on every .html file found.
def waklThroughDir(rootdir, dirFunc):
    """Print each file name under rootdir and pass the full path of every
    file ending in '.html' to dirFunc."""
    for parent, dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            print(filename)
            # Only files with the .html suffix are processed.
            if filename.endswith('.html'):
                dirFunc(os.path.join(parent, filename))
- 采集温州房产网基本信息
# -*- coding: utf-8 -*-
import re
import requests
import time
#----------------------------- Regular-expression constants used for parsing --------------------------------------------
# Total number of records (matched against the raw page text by getPageNum)
PAGE_NUM = '共找到 (.*?) 符合条件的记录'
# Residential community name (pattern has no spaces: it is matched after all whitespace is stripped)
NAME = 'texttext_title"><ahref(.*?)</a></div><divclass="texttext_moreinfo">'
# Community price
PRICE = 'class="hot_price">(.*?)</span>'
# Community address
ADDRESS = 'text_moreinfo">(.*?)</div><divclass="texttext_moreinfo"><span>'
# Directory where generated files are written
ROOTDIR = 'F:\\test\\'
#----------------------------- Request headers imitating a browser, otherwise the request is recognized as a bot and blocked ----
HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Host': 'www.0577home.net',
'Upgrade-Insecure-Requests': '1'
}
#----------------------------- Download one page of the community list; pageNo is the page number ----------------------
def getHouseListByPageno(pageNo):
    """Download list page pageNo and save it as ROOTDIR/houseList_pageNo<pageNo>.txt."""
    url = 'http://www.0577home.net/xiaoqu/list_0_0_0_0_0_0_0_' + str(pageNo) + '.html'
    # The session is closed as soon as the page has been fetched.
    with requests.session() as session:
        houseList = session.get(url, headers=HEADERS, verify=False)
    # Write the page text (overwriting any earlier download of this page).
    with open(ROOTDIR + "houseList_pageNo" + str(pageNo) + ".txt", 'w', encoding='utf-8') as fh:
        fh.write(houseList.text)
#------------------------------- Work out how many list pages have to be downloaded ------------------------------------
def getPageNum():
    """Return the number of list pages (20 records per page) as an int.

    Reads the already downloaded first list page and extracts the total record
    count with PAGE_NUM. Raises ValueError when the count is not found.
    """
    with open(ROOTDIR + 'houseList_pageNo1.txt', encoding='utf-8') as f:
        rawContent = f.read()
    pageNum = re.findall(PAGE_NUM, rawContent)
    if not pageNum:
        raise ValueError('total record count not found in houseList_pageNo1.txt')
    total = int(pageNum[0])
    # Ceiling division: an exact multiple of 20 must not yield an extra, empty
    # page, and the result stays an int under Python 3's true division.
    return max(1, -(-total // 20))
def parseHouseListToFile(srcFile, dstFile):
    """Parse one downloaded list page and append name$price$address lines to dstFile.

    srcFile is the path of the saved page; dstFile is an open, writable text file.
    """
    with open(srcFile, encoding='utf-8') as f:
        rawContent = f.read()
    # The patterns are written without spaces, so drop all whitespace first.
    content = re.sub(r'\s+', '', rawContent)
    # Keep only the text after the last '>' of each raw match.
    names = [raw[raw.rfind('>') + 1:] for raw in re.findall(NAME, content)]
    prices = re.findall(PRICE, content)
    address = [raw[raw.rfind('>') + 1:] for raw in re.findall(ADDRESS, content)]
    # zip stops at the shortest list, so a page with a missing field no longer
    # aborts the run with an IndexError half-way through writing.
    for name, price, addr in zip(names, prices, address):
        # Fields are separated by '$'; each record ends with a newline.
        dstFile.write(name + '$' + price + '$' + addr + '\n')
#------------------------------- Entry point: download and parse the community list -----------------------------------
if __name__ == '__main__':
    #--------------------- Download pages ---------------------
    # The first page is needed to learn the total number of pages.
    getHouseListByPageno(1)
    pageNum = int(getPageNum())
    # Download the remaining pages.
    for i in range(2, pageNum + 1):
        getHouseListByPageno(i)
    #--------------------- Parse pages ------------------------
    # Output file name is prefixed with today's date (YYYYMMDD).
    localtime = time.strftime('%Y%m%d')
    with open(ROOTDIR + localtime + '_houseList.txt', 'a+', encoding='utf-8') as f:
        # Parse exactly the pages downloaded above instead of a hard-coded count.
        for k in range(1, pageNum + 1):
            parseHouseListToFile(ROOTDIR + "houseList_pageNo" + str(k) + ".txt", f)
- 采集温州房产网详细信息
# -*- coding: utf-8 -*-
import re
import requests
import time
import os
#----------------------------- Regular-expression constants used for parsing --------------------------------------------
# Total number of records
PAGE_NUM = '共找到 (.*?) 符合条件的记录'
# Residential community name
NAME = 'texttext_title"><ahref(.*?)</a></div><divclass="texttext_moreinfo">'
# Community price
PRICE = 'class="hot_price">(.*?)</span>'
# Community address
ADDRESS = 'text_moreinfo">(.*?)</div><divclass="texttext_moreinfo"><span>'
# Community id (taken from the detail-page link)
ID = 'class="picdiv_left"><ahref="http://www.0577home.net/xiaoqu/(.*?).html'
# District the community belongs to
LOCATION = '<div><a>所属区域:</a><span>(.*?)</span></div>'
# Land area
AREA = '<div><a>占地面积:</a><span>(.*?)</span></div>'
# Greening rate
GREENINGRATE = '<div><a>绿化率:</a><span>(.*?)</span></div>'
# Total number of buildings
LAYER = '<div><a>楼总数:</a><span>(.*?)</span></div>'
# Property type
TYPE = '<div><a>物业类型:</a><span>(.*?)</span></div>'
# Assigned primary school
PRIMARYSCHOOL = '<div><a>所属小学:</a><span>(.*?)</span></div>'
# Total floor area
BUILDINGAREA = '<div><a>总建筑面积:</a><span>(.*?)</span></div>'
# Plot ratio
PLOTRATIO = '<div><a>容积率:</a><span>(.*?)</span></div>'
# Developer
DEVEPLOPER = '<div><a>开发商:</a><span>(.*?)</span></div>'
# Directory where generated files are written
ROOTDIR = 'F:\\test\\'
#----------------------------- Request headers imitating a browser, otherwise the request is recognized as a bot and blocked ----
HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Host': 'www.0577home.net',
'Upgrade-Insecure-Requests': '1'
}
#----------------------------- Download one page of the community list; pageNo is the page number ----------------------
def getHouseListByPageno(pageNo):
    """Download list page pageNo and save it as ROOTDIR/houseList_pageNo<pageNo>.txt."""
    url = 'http://www.0577home.net/xiaoqu/list_0_0_0_0_0_0_0_' + str(pageNo) + '.html'
    # The session is closed as soon as the page has been fetched.
    with requests.session() as session:
        houseList = session.get(url, headers=HEADERS, verify=False)
    # Write the page text (overwriting any earlier download of this page).
    with open(ROOTDIR + "houseList_pageNo" + str(pageNo) + ".txt", 'w', encoding='utf-8') as fh:
        fh.write(houseList.text)
def getHouseInfoByPageno(pageNo, k):
    """Download the detail page of community pageNo unless it is already on disk.

    k is only used in the progress message. The page is saved as
    ROOTDIR/houseInfo_pageNo<pageNo>.html.
    """
    target = ROOTDIR + "houseInfo_pageNo" + str(pageNo) + ".html"
    # Already downloaded: skip the request so reruns are cheap.
    if os.path.exists(target):
        return
    print('downloading !, count %s, page %s' % (str(k), str(pageNo)))
    url = 'http://www.0577home.net/xiaoqu/detail_' + str(pageNo) + '.html'
    with requests.session() as session:
        houseList = session.get(url, headers=HEADERS, verify=False)
    with open(target, 'w', encoding='utf-8') as fh:
        fh.write(houseList.text)
#------------------------------- Work out how many list pages have to be downloaded ------------------------------------
def getPageNum():
    """Return the number of list pages (20 records per page) as an int.

    Reads the already downloaded first list page and extracts the total record
    count with PAGE_NUM. Raises ValueError when the count is not found.
    """
    with open(ROOTDIR + 'houseList_pageNo1.txt', encoding='utf-8') as f:
        rawContent = f.read()
    pageNum = re.findall(PAGE_NUM, rawContent)
    if not pageNum:
        raise ValueError('total record count not found in houseList_pageNo1.txt')
    total = int(pageNum[0])
    # Ceiling division: an exact multiple of 20 must not yield an extra, empty
    # page, and the result stays an int under Python 3's true division.
    return max(1, -(-total // 20))
def parseHouseInfo(srcFile):
    """Parse one downloaded detail page.

    Returns a 10-tuple of strings: (category1, category2, area, greeningrate,
    layer, type, primaryschool, buildingarea, plotratio, developer). A field
    that is missing from the page is returned as '' instead of raising.
    """
    with open(srcFile, encoding='utf-8') as f:
        content = f.read()

    def first_match(pattern):
        # First capture of pattern in the page, or '' when it does not occur.
        found = re.findall(pattern, content)
        return found[0] if found else ''

    # The location text holds two space-separated levels.
    location = first_match(LOCATION).split(' ')
    category1 = location[0]
    category2 = location[1] if len(location) > 1 else ''
    details = [first_match(pattern) for pattern in (
        AREA, GREENINGRATE, LAYER, TYPE, PRIMARYSCHOOL,
        BUILDINGAREA, PLOTRATIO, DEVEPLOPER)]
    return (category1, category2, *details)
def parseHouseListToFile(srcFile, dstFile):
    """Parse one downloaded list page and append name$price$address$id lines to dstFile.

    srcFile is the path of the saved page; dstFile is an open, writable text file.
    """
    with open(srcFile, encoding='utf-8') as f:
        rawContent = f.read()
    # The patterns are written without spaces, so drop all whitespace first.
    content = re.sub(r'\s+', '', rawContent)
    # Keep only the text after the last '>' of each raw match.
    names = [raw[raw.rfind('>') + 1:] for raw in re.findall(NAME, content)]
    prices = re.findall(PRICE, content)
    address = [raw[raw.rfind('>') + 1:] for raw in re.findall(ADDRESS, content)]
    ids = re.findall(ID, content)
    # zip stops at the shortest list, so a page with a missing field no longer
    # aborts the run with an IndexError half-way through writing.
    for name, price, addr, house_id in zip(names, prices, address, ids):
        # Fields are separated by '$'; each record ends with a newline.
        dstFile.write(name + '$' + price + '$' + addr + '$' + house_id + '\n')
#------------------------------- Entry point: parse the list pages and collect the detail pages ------------------------
if __name__ == '__main__':
    # NOTE: the list pages houseList_pageNo1..114.txt are expected to be on disk
    # already; they can be (re)downloaded with getHouseListByPageno(1),
    # getPageNum() and getHouseListByPageno(i) for the remaining pages.
    #--------------------- Parse list pages --------------------
    # Output file names are prefixed with today's date (YYYYMMDD).
    localtime = time.strftime('%Y%m%d')
    list_path = ROOTDIR + localtime + '_houseList.txt'
    # NOTE(review): 'a+' appends, so a second run on the same day duplicates
    # every record in the list file - confirm whether 'w' was intended.
    with open(list_path, 'a+', encoding='utf-8') as f:
        for k in range(1, 115):
            parseHouseListToFile(ROOTDIR + "houseList_pageNo" + str(k) + ".txt", f)
    #--------------------- Fetch and parse detail pages --------
    with open(list_path, encoding='utf-8') as f, \
            open(ROOTDIR + localtime + '_houseInfo.txt', 'w', encoding='utf-8') as fd:
        k = 0
        for line in f:
            data = line.strip('\n').split('$')
            # Skip malformed records that carry no community id (4th field).
            if len(data) < 4:
                continue
            idx = data[3]
            getHouseInfoByPageno(idx, k)
            houseInfo = parseHouseInfo(ROOTDIR + "houseInfo_pageNo" + str(idx) + ".html")
            print(str(k) + "$".join(data) + '$' + "$".join(houseInfo))
            fd.write("$".join(data) + '$' + "$".join(houseInfo) + '\n')
            k += 1
- 读取csv文件
# The csv module asks for newline='' so that newlines embedded in quoted
# fields are handled by the reader itself.
with open('job.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)
- 写入csv文件
# Create the CSV file (writing the header row) or reopen it for appending.
def createCsv(file, header=None):
    """Return a csv.writer that appends to file.

    When file does not exist yet it is created and the header row is written
    first; header defaults to the module-level `paramname`. The underlying
    file object stays open for as long as the returned writer is alive.
    """
    is_new = not os.path.exists(file)
    # Same encoding for new and existing files, so appended rows always match.
    csvfile = open(file, 'a+', encoding='utf-8', newline='')
    writer = csv.writer(csvfile)
    if is_new:
        writer.writerow(paramname if header is None else header)
        # Make the header visible on disk immediately.
        csvfile.flush()
    return writer
- python调用JAVA
import sys
import jpype
# Value to decrypt, taken from the first command-line argument.
name = sys.argv[1]
# Directory handed to the JVM through java.ext.dirs so the class can be found.
jarpath = '/home/dsadm/why/python'
jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.ext.dirs=%s" % jarpath)
try:
    DECRYPT = jpype.JClass('why.fmrt.decrypt.DECRYPT')
    upperName = DECRYPT.decrypt(name)
    print(upperName)
finally:
    # Shut the JVM down even when loading the class or decrypting fails.
    jpype.shutdownJVM()
- 简单验证码破解
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess
import requests
from PIL import Image
from PIL import ImageOps
def cleanImage(imagePath):
    """Binarize the image at imagePath and add a white border, overwriting the file."""
    image = Image.open(imagePath)
    # Threshold: values below 143 become 0, everything else 255.
    image = image.point(lambda x: 0 if x < 143 else 255)
    borderImage = ImageOps.expand(image, border=20, fill='white')
    borderImage.save(imagePath)
html = urlopen("http://www.pythonscraping.com/humans-only")
bsObj = BeautifulSoup(html, "html.parser")
# Gather prepopulated form values
imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
formBuildId = bsObj.find("input", {"name": "form_build_id"})["value"]
captchaSid = bsObj.find("input", {"name": "captcha_sid"})["value"]
captchaToken = bsObj.find("input", {"name": "captcha_token"})["value"]

captchaUrl = "http://pythonscraping.com" + imageLocation
urlretrieve(captchaUrl, "captcha.jpg")
cleanImage("captcha.jpg")
# tesseract writes its result to captcha.txt
p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# communicate() drains both pipes while waiting; wait() alone can block
# forever once a pipe buffer fills up.
p.communicate()
with open("captcha.txt", "r") as f:
    # Clean any whitespace characters
    captchaResponse = f.read().replace(" ", "").replace("\n", "")
print("Captcha solution attempt: " + captchaResponse)

if len(captchaResponse) == 5:
    params = {"captcha_token": captchaToken, "captcha_sid": captchaSid,
              "form_id": "comment_node_page_form", "form_build_id": formBuildId,
              "captcha_response": captchaResponse, "name": "Ryan Mitchell",
              "subject": "I come to seek the Grail",
              "comment_body[und][0][value]":
                  "...and I am definitely not a bot"}
    r = requests.post("http://www.pythonscraping.com/comment/reply/10",
                      data=params)
    # Name the parser explicitly, as done for the first page above.
    responseObj = BeautifulSoup(r.text, "html.parser")
    message = responseObj.find("div", {"class": "messages"})
    if message is not None:
        print(message.get_text())
else:
    print("There was a problem reading the CAPTCHA correctly!")
- 滑块验证码破解
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import PIL.Image as image
import time,re, random
import requests
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
# Browser headers the crawler sends to look like a regular browser
agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
headers = {
'User-Agent': agent
}
# Reassemble the scrambled captcha picture from its tiles.
# filename: path of the downloaded (scrambled) image
# location_list: one dict per tile holding its 'x' / 'y' CSS background offsets
# Pillow notes:
#   crop() takes a box (left, upper, right, lower), not (x, y, width, height)
#   paste() takes the image to insert and the (x, y) of its top-left corner
def get_merge_image(filename, location_list):
    """Return a new 260x116 RGB image with the tiles put back in order."""
    im = image.open(filename)
    upper_tiles = []
    lower_tiles = []
    # Cut out the 10-pixel-wide tiles.
    for location in location_list:
        left = abs(location['x'])
        # Tiles of the restored upper half sit in rows 58..115 of the source.
        if location['y'] == -58:
            upper_tiles.append(im.crop((left, 58, left + 10, 116)))
        # Tiles of the restored lower half sit in rows 0..57 of the source.
        if location['y'] == 0:
            lower_tiles.append(im.crop((left, 0, left + 10, 58)))
    new_im = image.new('RGB', (260, 116))
    # Paste each row of tiles left to right.
    for row_top, tiles in ((0, upper_tiles), (58, lower_tiles)):
        x_offset = 0
        for tile in tiles:
            new_im.paste(tile, (x_offset, row_top))
            x_offset += tile.size[0]
    return new_im
# Download the scrambled picture and restore it.
# driver: webdriver
# div: xpath of the divs that make up the picture
def get_image(driver, div):
    """Return the restored captcha image for the tiles selected by div."""
    # The picture is scrambled with CSS background offsets; collect the offset
    # of every tile so the picture can be put back together.
    style_pattern = re.compile(
        r'background-image: url\("(.*)"\); background-position: (.*)px (.*)px;')
    background_images = driver.find_elements_by_xpath(div)
    location_list = []
    imageurl = ''
    for background_image in background_images:
        # One match per tile: (image url, x offset, y offset).
        url, x, y = style_pattern.findall(background_image.get_attribute('style'))[0]
        imageurl = url
        location_list.append({'x': int(x), 'y': int(y)})
    # Ask for the jpg variant of the picture instead of webp.
    imageurl = imageurl.replace("webp", "jpg")
    imageName = imageurl.split('/')[-1]
    session = requests.session()
    r = session.get(imageurl, headers=headers, verify=False)
    # Save the picture next to the script.
    with open(imageName, 'wb') as f:
        f.write(r.content)
    # Local name chosen so it does not shadow the `image` (PIL.Image) alias.
    merged = get_merge_image(imageName, location_list)
    return merged
# Compare the RGB values of one pixel in two images.
def is_similar(image1, image2, x, y):
    """Return True when the pixels at (x, y) differ by less than 50 on every
    one of the first three channels, False otherwise."""
    pixel1 = image1.getpixel((x, y))
    pixel2 = image2.getpixel((x, y))
    for i in range(0, 3):
        # A difference of 50 or more on any channel marks the gap.
        if abs(pixel1[i] - pixel2[i]) >= 50:
            return False
    return True
# Locate the gap.
def get_diff_location(image1, image2):
    """Return the x coordinate of the first column in which the two images differ.

    The images are scanned column by column over the size of image1 (260x116
    for the captcha pictures). Returns None when no pixel differs.
    """
    width, height = image1.size
    for i in range(0, width):
        for j in range(0, height):
            if not is_similar(image1, image2, i, j):
                return i
    return None
# Build the list of x-axis steps that simulates dragging the slider to the gap.
def get_track(length):
    """Return a list of step sizes whose sum equals length.

    Steps are random values between 1 and 3 until fewer than 5 pixels plus
    the next step remain; the remainder is covered one pixel at a time.
    """
    track = []
    x = random.randint(1, 3)
    while length - x >= 5:
        track.append(x)
        length = length - x
        x = random.randint(1, 3)
    # Finish with single-pixel steps.
    for i in range(length):
        track.append(1)
    return track
# Slider-captcha solver
def main():
    """Open the demo page in Firefox, locate the gap and drag the slider to it."""
    driver = webdriver.Firefox()
    driver.get("http://www.geetest.com/exp_embed")
    # Wait until the slider knob and both pictures are displayed.
    for xpath in ("//div[@class='gt_slider_knob gt_show']",
                  "//div[@class='gt_cut_bg gt_show']",
                  "//div[@class='gt_cut_fullbg gt_show']"):
        WebDriverWait(driver, 30).until(
            lambda the_driver, xpath=xpath: the_driver.find_element_by_xpath(xpath).is_displayed())
    # Download both pictures (with and without the gap).
    image1 = get_image(driver, "//div[@class='gt_cut_bg gt_show']/div")
    image2 = get_image(driver, "//div[@class='gt_cut_fullbg gt_show']/div")
    # x position of the gap and the step list leading to it.
    loc = get_diff_location(image1, image2)
    track_list = get_track(loc)
    # The slider knob and its vertical position on the page.
    element = driver.find_element_by_xpath("//div[@class='gt_slider_knob gt_show']")
    location = element.location
    y = location['y']
    # Press the knob and keep the button down.
    print ("第一步,点击元素")
    ActionChains(driver).click_and_hold(on_element=element).perform()
    time.sleep(0.15)
    print ("第二步,拖动元素")
    track_string = ""
    for track in track_list:
        track_string = track_string + "{%d,%d}," % (track, y - 445)
        # xoffset=track+22: offsets are relative to the knob's top-left corner
        # while the track refers to its centre, hence half the knob width is added.
        # yoffset=y-445: same idea; rendering differs between browsers, the
        # resulting value has to be 22 (half the knob height).
        ActionChains(driver).move_to_element_with_offset(
            to_element=element, xoffset=track + 22, yoffset=y - 445).perform()
        # Random pause between steps: moving too fast is detected as a bot.
        time.sleep(random.randint(10, 50) / 100)
    print (track_string)
    # xoffset=21 steps back by one pixel; done five times because the knob
    # sits five pixels away from the left edge of the slider bar.
    for step in range(5):
        if step:
            time.sleep(0.1)
        ActionChains(driver).move_to_element_with_offset(
            to_element=element, xoffset=21, yoffset=y - 445).perform()
    print ("第三步,释放鼠标")
    # Release the mouse button.
    ActionChains(driver).release(on_element=element).perform()
    time.sleep(3)
    # The browser is left open on purpose so the result can be inspected;
    # call driver.quit() here to close it.
# Script entry point
if __name__ == '__main__':
    main()
- python构建web页面
import os
import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from view import *
from tornado.options import define, options
# Command-line option --port (int, default 8000); read later as options.port.
define("port", default=8000, help="run on the given port", type=int)
class Application(tornado.web.Application):
    """Tornado application that routes "/" to Indexhandler and loads its
    templates from the 'templates' directory next to this file."""

    def __init__(self):
        handlers = [
            (r"/", Indexhandler),
        ]
        settings = dict(
            template_path=os.path.join(os.path.dirname(__file__), 'templates'),
            autoescape=None,
            debug=False,
        )
        super().__init__(handlers, **settings)
if __name__ == "__main__":
    # Pick up --port (and tornado's own options) from the command line.
    tornado.options.parse_command_line()
    http_server = tornado.httpserver.HTTPServer(Application(), xheaders=True)
    http_server.listen(options.port)
    # IOLoop.current() replaces the deprecated IOLoop.instance() alias.
    tornado.ioloop.IOLoop.current().start()
- 定时任务
#! /usr/bin/env python
# coding=utf-8
import time, os, sched
# First argument (time.time): the clock, returning the current time in seconds.
# Second argument (time.sleep): the function used to wait until the next event is due.
schedule = sched.scheduler(time.time, time.sleep)
def perform_command(cmd, inc):
    """Run cmd through the shell and schedule the next run inc seconds from now."""
    # Re-arm first so the command keeps running periodically.
    schedule.enter(inc, 0, perform_command, (cmd, inc))
    os.system(cmd)
def timming_exe(cmd, inc=60):
    """Run cmd every inc seconds; the first run happens inc seconds from now.

    Blocks in schedule.run(); since perform_command re-arms itself on every
    run, the queue never becomes empty.
    """
    # enter() schedules an event relative to now.
    schedule.enter(inc, 0, perform_command, (cmd, inc))
    # Runs until the event queue is empty.
    schedule.run()
# Run getMovieList.py once a day (the first run happens one day after start-up).
timming_exe("getMovieList.py", 60 * 60 * 24)
- 通过百度地图API,标准化地址
from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.error import URLError
import json
class xBaiduMap:
    """Small client for the Baidu Map geocoder endpoint (address <-> coordinates)."""

    def __init__(self, key='mgf2Gxr7EgnfPVQnpClZnsug'):
        self.host = 'http://api.map.baidu.com'
        self.path = '/geocoder?'
        # Template of the supported query parameters; requests are built from a
        # fresh copy, so this dict is never modified by a lookup.
        self.param = {'address': None, 'output': 'json', 'key': key, 'location': None, 'city': None}

    def getLocation(self, address, city=None):
        """Return (lat, lng) for address, or None when the lookup fails or the
        service returns a list instead of a single result."""
        rlt = self.geocoding('address', address, city)
        if rlt is None:
            return None
        result = rlt['result']
        if isinstance(result, list):
            return None
        return result['location']['lat'], result['location']['lng']

    def getAddress(self, lat, lng):
        """Return 'city;district;street;street_number' for the coordinates, or
        None when the lookup fails."""
        rlt = self.geocoding('location', "{0},{1}".format(lat, lng))
        if rlt is None:
            return None
        # 'addressComponent' holds the individual parts of the address.
        ld = rlt['result']['addressComponent']
        return (ld['city'] + ';' + ld['district'] + ';' + ld['street'] + ";" + ld['street_number'])

    def _build_query(self, key, value, city=None):
        """Return the query parameters for one request.

        Only the parameter being looked up is sent; 'city' is added for
        address lookups when one is given. Building a new dict per call keeps
        a previous lookup from leaking parameters (such as city=None) into
        the next one.
        """
        query = {'output': self.param.get('output', 'json'), 'key': self.param.get('key')}
        query[key] = value
        if key == 'address' and city is not None:
            query['city'] = city
        return query

    def geocoding(self, key, value, city=None):
        """Query the geocoder with key=value ('address' or 'location').

        Returns the decoded JSON response when its status is 'OK', otherwise
        None (after printing a short message).
        """
        query = self._build_query(key, value, city)
        try:
            r = urlopen(self.host + self.path + urlencode(query)).read()
        except URLError:
            print ("URLError")
            return None
        str_response = r.decode('utf-8')
        rlt = json.loads(str_response)
        if rlt['status'] == 'OK':
            return rlt
        print ("Decoding Failed")
        return None
- 多进程
import multiprocessing
# Start PROCESS_NUM worker processes; each gets its own id as the only argument.
for process_id in range(PROCESS_NUM):
    p = multiprocessing.Process(target=worker, args=(process_id,))
    # Keep a reference to every process in jobs.
    jobs.append(p)
    p.start()
- 文件切割小程序
def split_file(file_name, file_num):
    """Split file_name into file_num consecutive parts split_0.txt .. split_<n-1>.txt.

    The parts are written to the current directory. Nothing is done when
    split_0.txt already exists. Raises ValueError when file_num is not positive.
    """
    import contextlib

    # A previous run already produced the parts.
    if os.path.exists("split_0.txt"):
        return
    if file_num <= 0:
        raise ValueError("file_num must be a positive integer")
    # Count the lines of the source file.
    with open(file_name, encoding='utf-8') as source:
        total = sum(1 for _ in source)
    # Every part is opened exactly once and closed by the ExitStack, instead
    # of opening (and leaking) a handle for each line that is copied.
    with contextlib.ExitStack() as stack:
        parts = [
            stack.enter_context(open("split_" + str(i) + ".txt", 'w', encoding='utf-8'))
            for i in range(file_num)
        ]
        source = stack.enter_context(open(file_name, encoding='utf-8'))
        for index, line in enumerate(source):
            # Line `index` goes to part floor(index / (total / file_num)).
            parts[index * file_num // total].write(line)
- python操作DB2
import ibm_db
# The connection-string keyword is PROTOCOL (the snippet used to spell it "PORTOCOL").
con = ibm_db.connect("DATABASE=FMRT;HOSTNAME=XX.XX.XX.XX;PORT=60000;PROTOCOL=TCPIP;UID=db2inst1;PWD=db2inst1;", "", "")
try:
    sql = getSql(inputfile)
    stmt = ibm_db.exec_immediate(con, sql)
    # Fetch one row at a time; the loop ends when fetch_both returns a falsy value.
    result = ibm_db.fetch_both(stmt)
    while result:
        # DO SOMETHING
        result = ibm_db.fetch_both(stmt)
finally:
    # Release the connection even when the query fails.
    ibm_db.close(con)
- jieba中文分词
import jieba
# jieba.cut returns a generator: materialize it so the words can be printed
# one by one and joined afterwards (a generator can only be consumed once).
seg_list = list(jieba.cut("我来到北京清华大学", cut_all=True))
for line in seg_list:
    print(line)
print("Full Mode: " + "/ ".join(seg_list))  # full mode
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # accurate mode
seg_list = jieba.cut("他来到了网易杭研大厦")  # accurate mode is the default
print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
print(", ".join(seg_list))
- 月末判断
import calendar
import sys
def isMonthEnd(datetime):
    """Return 1 when the date string 'YYYYMMDD...' is the last day of its month, else 0."""
    year = int(datetime[0:4])
    month = int(datetime[4:6])
    day = int(datetime[6:8])
    # monthrange returns (weekday of the 1st, number of days in the month).
    _, days_in_month = calendar.monthrange(year, month)
    return 1 if day == days_in_month else 0
# NOTE(review): the 1/0 result is discarded here - presumably it was meant to
# be printed or used as the exit status; confirm.
isMonthEnd(sys.argv[1])
- 移除中文分隔符
import shlex

# Quote both file names so paths containing spaces or shell metacharacters
# cannot break (or inject into) the command line.
cmd = "sed ':a;N;$ s/\\r\\n//g;ba' " + shlex.quote(oldfile) + " > " + shlex.quote(newfile)
os.system(cmd)
- 多线程
# -*- coding: utf-8 -*-
"""
thread
~~~~~~~~~~~~~~~~
Thread framework
:copyright: (c) 2016 by why.
:license: MIT, see LICENSE for more details.
"""
import threading
class Threadconfig():
    """Collects worker threads and runs them together.

    Usage: topen() to reset the task list, build() once per worker, then
    run() to start every worker and wait for the first thread_size of them.
    """

    def __init__(self, thread_size):
        self.thread_size = thread_size
        # Created here so build() also works when topen() was not called first.
        self.thread_tasks = []

    def topen(self):
        """Start a new, empty batch of tasks."""
        self.thread_tasks = []

    def build(self, func, **kwargs):
        """Register a thread that will call func(**kwargs)."""
        self.thread_task = threading.Thread(target=func, kwargs=kwargs)
        self.thread_tasks.append(self.thread_task)

    def run(self):
        """Start all registered threads as daemons and block until the first
        thread_size of them have finished."""
        for thread_task in self.thread_tasks:
            thread_task.daemon = True
            thread_task.start()
        # join() sleeps until a thread ends instead of spinning in a polling
        # loop, and the slice copes with fewer tasks than thread_size.
        for thread_task in self.thread_tasks[:self.thread_size]:
            thread_task.join()

    def __del__(self):
        self.thread_tasks = []
- python 安装wheel
pip install *.whl