通知设置 新通知
模拟登录网易163失败
python爬虫 • xiaoai 回复了问题 • 2 人关注 • 2 个回复 • 5463 次浏览 • 2020-06-28 14:25
深圳住房公积金验证码 识别破解
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 2929 次浏览 • 2020-06-26 14:34
http://gjj.sz.gov.cn/fzgn/zfcq/index.html
比较常规的验证码,使用keras全连接层,cv切割后每个字符只需要20个样本就达到准确率99%。
需要模型或者代码的私聊。 查看全部
http://gjj.sz.gov.cn/fzgn/zfcq/index.html
比较常规的验证码,使用keras全连接层,cv切割后每个字符只需要20个样本就达到准确率99%。
需要模型或者代码的私聊。
PyQt5自定义控件
李魔佛 发表了文章 • 0 个评论 • 3341 次浏览 • 2020-06-13 23:14
Burning widget(烧录控件)
这个控件可能会在Nero,K3B或其他CD/DVD烧录软件中见到。
# -*- coding: utf-8 -*-
"""
PyQt5 tutorial
In this example, we create a custom widget.
"""
import sys
from PyQt5.QtWidgets import (QWidget, QSlider, QApplication,
QHBoxLayout, QVBoxLayout)
from PyQt5.QtCore import QObject, Qt, pyqtSignal
from PyQt5.QtGui import QPainter, QFont, QColor, QPen
class Communicate(QObject):
    """Holder for the custom signal that carries the slider value."""

    # Emitted with the new integer value; connected to BurningWidget.setValue.
    updateBW = pyqtSignal(int)
class BurningWidget(QWidget):
    """Custom gauge that shows how much of a burnable medium is used.

    The bar is filled in yellow up to the current value (scale 0-750).
    From a value of 700 upwards the part beyond 700 is drawn in red.
    """

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Set the minimum size, the initial value and the tick labels."""
        self.setMinimumSize(1, 30)
        self.value = 75
        # Labels for the nine tick marks drawn along the bar.
        self.num = [75, 150, 225, 300, 375, 450, 525, 600, 675]

    def setValue(self, value):
        """Store the value to display; repainting is left to the caller."""
        self.value = value

    def paintEvent(self, e):
        qp = QPainter()
        qp.begin(self)
        self.drawWidget(qp)
        qp.end()

    def drawWidget(self, qp):
        """Paint the filled bar, the outline, the tick marks and the labels."""
        font = QFont('Serif', 7, QFont.Light)
        qp.setFont(font)

        size = self.size()
        w = size.width()
        h = size.height()

        # All geometry is derived from the current width so the widget scales.
        step = int(round(w / 10.0))
        till = int((w / 750.0) * self.value)
        full = int((w / 750.0) * 700)

        if self.value >= 700:
            # Yellow up to the 700 mark, red for the part beyond it.
            qp.setPen(QColor(255, 255, 255))
            qp.setBrush(QColor(255, 255, 184))
            qp.drawRect(0, 0, full, h)
            qp.setPen(QColor(255, 175, 175))
            qp.setBrush(QColor(255, 175, 175))
            qp.drawRect(full, 0, till - full, h)
        else:
            qp.setPen(QColor(255, 255, 255))
            qp.setBrush(QColor(255, 255, 184))
            qp.drawRect(0, 0, till, h)

        pen = QPen(QColor(20, 20, 20), 1, Qt.SolidLine)
        qp.setPen(pen)
        qp.setBrush(Qt.NoBrush)
        qp.drawRect(0, 0, w - 1, h - 1)

        # The font does not change inside the loop, so fetch the metrics once.
        metrics = qp.fontMetrics()
        # Tick j sits at x = j * step. Computing x directly (instead of
        # range(step, 10 * step, step)) avoids a ValueError when the widget
        # is narrower than 5 px and step rounds to 0.
        for j, label in enumerate(self.num, start=1):
            x = j * step
            qp.drawLine(x, 0, x, 5)
            text = str(label)
            fw = metrics.width(text)
            # drawText needs integer coordinates; passing floats raises
            # TypeError on Python 3.10+.
            qp.drawText(int(x - fw / 2), h // 2, text)
class Example(QWidget):
    """Demo window: a horizontal slider that drives a BurningWidget."""

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Create the slider and the burning widget and wire them together."""
        sld = QSlider(Qt.Horizontal, self)
        sld.setFocusPolicy(Qt.NoFocus)
        sld.setRange(1, 750)
        sld.setValue(75)
        sld.setGeometry(30, 40, 150, 30)

        # The slider value reaches the widget through the updateBW signal.
        self.c = Communicate()
        self.wid = BurningWidget()
        self.c.updateBW[int].connect(self.wid.setValue)
        sld.valueChanged[int].connect(self.changeValue)

        # A stretch above the row keeps the burning widget at the bottom.
        hbox = QHBoxLayout()
        hbox.addWidget(self.wid)
        vbox = QVBoxLayout()
        vbox.addStretch(1)
        vbox.addLayout(hbox)
        self.setLayout(vbox)

        self.setGeometry(300, 300, 390, 210)
        self.setWindowTitle('Burning widget')
        self.show()

    def changeValue(self, value):
        """Forward the new slider value to the widget and redraw it."""
        self.c.updateBW.emit(value)
        self.wid.repaint()
if __name__ == '__main__':
    app = QApplication(sys.argv)
    # Example.initUI() calls show(), so constructing it displays the window.
    ex = Example()
    sys.exit(app.exec_())
在示例中我们使用了滑块与一个自定义控件。自定义控件受滑块控制。控件显示了媒体介质的容量和剩余空间。该控件的最小值为1,最大值为750。在值超过700时颜色变为红色。这通常意味着超刻(即实际写入光盘的容量超过刻录盘片官方标称容量的一种操作)。
BurningWidget控件通过QHBoxLayout与QVBoxLayout置于窗体的底部。
class BurningWidget(QWidget):
def __init__(self):
super().__init__()
烧录的控件,它基于QWidget
self.setMinimumSize(1, 30)
我们改变了控件的最小尺寸(高度),因为默认值有点小。
font = QFont('Serif', 7, QFont.Light)
qp.setFont(font)我们使用一个比默认要小的字体。
size = self.size()
w = size.width()
h = size.height()
step = int(round(w / 10.0))
till = int(((w / 750.0) * self.value))
full = int(((w / 750.0) * 700))
控件采用了动态绘制技术。窗体越大,控件也随之变大;反之亦然。这也是我们需要计算自定义控件的载体控件(即窗体)尺寸的原因。till参数定义了需要绘制的总尺寸,它根据slider控件计算得出,是整体区域的比例值。full参数定义了红色区域的绘制起点。注意在绘制时为取得较大精度而使用的浮点数运算。
实际的绘制分三个步骤。黄色或红黄矩形的绘制,然后是刻度线的绘制,最后是刻度值的绘制。
metrics = qp.fontMetrics()
fw = metrics.width(str(self.num[j]))
qp.drawText(i-fw/2, h/2, str(self.num[j]))
我们使用字体度量来绘制文本。必须先知道文本的宽度,才能让文本以刻度线为中心水平居中。
def changeValue(self, value):
self.c.updateBW.emit(value)
self.wid.repaint()当滑块发生移动时,changeValue()方法会被调用。在方法内我们触发了一个自定义的updateBW信号,其参数是当前滚动条的值。该值被用于计算Burning widget的容量值。然后对控件进行重绘。
查看全部
Burning widget(烧录控件)
这个控件可能会在Nero,K3B或其他CD/DVD烧录软件中见到。
# -*- coding: utf-8 -*-
"""
PyQt5 tutorial
In this example, we create a custom widget.
"""
import sys
from PyQt5.QtWidgets import (QWidget, QSlider, QApplication,
QHBoxLayout, QVBoxLayout)
from PyQt5.QtCore import QObject, Qt, pyqtSignal
from PyQt5.QtGui import QPainter, QFont, QColor, QPen
class Communicate(QObject):
    """Holder for the custom signal that carries the slider value."""

    # Emitted with the new integer value; connected to BurningWidget.setValue.
    updateBW = pyqtSignal(int)
class BurningWidget(QWidget):
    """Custom gauge that shows how much of a burnable medium is used.

    The bar is filled in yellow up to the current value (scale 0-750).
    From a value of 700 upwards the part beyond 700 is drawn in red.
    """

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Set the minimum size, the initial value and the tick labels."""
        self.setMinimumSize(1, 30)
        self.value = 75
        # Labels for the nine tick marks drawn along the bar.
        self.num = [75, 150, 225, 300, 375, 450, 525, 600, 675]

    def setValue(self, value):
        """Store the value to display; repainting is left to the caller."""
        self.value = value

    def paintEvent(self, e):
        qp = QPainter()
        qp.begin(self)
        self.drawWidget(qp)
        qp.end()

    def drawWidget(self, qp):
        """Paint the filled bar, the outline, the tick marks and the labels."""
        font = QFont('Serif', 7, QFont.Light)
        qp.setFont(font)

        size = self.size()
        w = size.width()
        h = size.height()

        # All geometry is derived from the current width so the widget scales.
        step = int(round(w / 10.0))
        till = int((w / 750.0) * self.value)
        full = int((w / 750.0) * 700)

        if self.value >= 700:
            # Yellow up to the 700 mark, red for the part beyond it.
            qp.setPen(QColor(255, 255, 255))
            qp.setBrush(QColor(255, 255, 184))
            qp.drawRect(0, 0, full, h)
            qp.setPen(QColor(255, 175, 175))
            qp.setBrush(QColor(255, 175, 175))
            qp.drawRect(full, 0, till - full, h)
        else:
            qp.setPen(QColor(255, 255, 255))
            qp.setBrush(QColor(255, 255, 184))
            qp.drawRect(0, 0, till, h)

        pen = QPen(QColor(20, 20, 20), 1, Qt.SolidLine)
        qp.setPen(pen)
        qp.setBrush(Qt.NoBrush)
        qp.drawRect(0, 0, w - 1, h - 1)

        # The font does not change inside the loop, so fetch the metrics once.
        metrics = qp.fontMetrics()
        # Tick j sits at x = j * step. Computing x directly (instead of
        # range(step, 10 * step, step)) avoids a ValueError when the widget
        # is narrower than 5 px and step rounds to 0.
        for j, label in enumerate(self.num, start=1):
            x = j * step
            qp.drawLine(x, 0, x, 5)
            text = str(label)
            fw = metrics.width(text)
            # drawText needs integer coordinates; passing floats raises
            # TypeError on Python 3.10+.
            qp.drawText(int(x - fw / 2), h // 2, text)
class Example(QWidget):
    """Demo window: a horizontal slider that drives a BurningWidget."""

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Create the slider and the burning widget and wire them together."""
        sld = QSlider(Qt.Horizontal, self)
        sld.setFocusPolicy(Qt.NoFocus)
        sld.setRange(1, 750)
        sld.setValue(75)
        sld.setGeometry(30, 40, 150, 30)

        # The slider value reaches the widget through the updateBW signal.
        self.c = Communicate()
        self.wid = BurningWidget()
        self.c.updateBW[int].connect(self.wid.setValue)
        sld.valueChanged[int].connect(self.changeValue)

        # A stretch above the row keeps the burning widget at the bottom.
        hbox = QHBoxLayout()
        hbox.addWidget(self.wid)
        vbox = QVBoxLayout()
        vbox.addStretch(1)
        vbox.addLayout(hbox)
        self.setLayout(vbox)

        self.setGeometry(300, 300, 390, 210)
        self.setWindowTitle('Burning widget')
        self.show()

    def changeValue(self, value):
        """Forward the new slider value to the widget and redraw it."""
        self.c.updateBW.emit(value)
        self.wid.repaint()
if __name__ == '__main__':
    app = QApplication(sys.argv)
    # Example.initUI() calls show(), so constructing it displays the window.
    ex = Example()
    sys.exit(app.exec_())
在示例中我们使用了滑块与一个自定义控件。自定义控件受滑块控制。控件显示了媒体介质的容量和剩余空间。该控件的最小值为1,最大值为750。在值超过700时颜色变为红色。这通常意味着超刻(即实际写入光盘的容量超过刻录盘片官方标称容量的一种操作)。
BurningWidget控件通过QHBoxLayout与QVBoxLayout置于窗体的底部。
class BurningWidget(QWidget):
def __init__(self):
super().__init__()
烧录的控件,它基于QWidget
self.setMinimumSize(1, 30)
我们改变了控件的最小尺寸(高度),因为默认值有点小。
font = QFont('Serif', 7, QFont.Light)我们使用一个比默认要小的字体。
qp.setFont(font)
size = self.size()
w = size.width()
h = size.height()
step = int(round(w / 10.0))
till = int(((w / 750.0) * self.value))
full = int(((w / 750.0) * 700))
控件采用了动态绘制技术。窗体越大,控件也随之变大;反之亦然。这也是我们需要计算自定义控件的载体控件(即窗体)尺寸的原因。till参数定义了需要绘制的总尺寸,它根据slider控件计算得出,是整体区域的比例值。full参数定义了红色区域的绘制起点。注意在绘制时为取得较大精度而使用的浮点数运算。
实际的绘制分三个步骤。黄色或红黄矩形的绘制,然后是刻度线的绘制,最后是刻度值的绘制。
metrics = qp.fontMetrics()
我们使用字体度量来绘制文本。必须先知道文本的宽度,才能让文本以刻度线为中心水平居中。
fw = metrics.width(str(self.num[j]))
qp.drawText(i-fw/2, h/2, str(self.num[j]))
def changeValue(self, value):当滑块发生移动时,changeValue()方法会被调用。在方法内我们触发了一个自定义的updateBW信号,其参数是当前滚动条的值。该值被用于计算Burning widget的容量值。然后对控件进行重绘。
self.c.updateBW.emit(value)
self.wid.repaint()
Windows安装pyminizip
李魔佛 发表了文章 • 0 个评论 • 3763 次浏览 • 2020-05-31 19:06
pip install pyminizip
电脑需要安装vc的编译库,或者在其他机子上把pyd文件拷贝到程序的当前目录。
pip install pyminizip
电脑需要安装vc的编译库,或者在其他机子上把pyd文件拷贝到程序的当前目录。
为什么我使用splash中间件得到的response.body和splash上访问的html代码不同
李魔佛 回复了问题 • 1 人关注 • 1 个回复 • 3403 次浏览 • 2020-04-29 00:19
pyqt5 QRect在哪个类
李魔佛 发表了文章 • 0 个评论 • 2679 次浏览 • 2020-04-24 10:45
from PyQt5.QtCore import Qt,QRect
from PyQt5.QtCore import Qt,QRect
请问各位用scrapy和redis方法爬取不到数据的问题(可悬赏),求大佬看下,感激不尽
python爬虫 • 李魔佛 回复了问题 • 2 人关注 • 1 个回复 • 8158 次浏览 • 2020-04-16 22:16
薅“疫情公益”羊毛,黑产恶意爬取各大出版社电子书上万册
python爬虫 • Magiccc 发表了文章 • 0 个评论 • 3070 次浏览 • 2020-02-26 13:17
requests请求返回的json格式为bytes乱码
python爬虫 • 李魔佛 回复了问题 • 2 人关注 • 1 个回复 • 5097 次浏览 • 2020-02-16 23:35
为什么我这段代码得到的是空列表呢
python爬虫 • 李魔佛 回复了问题 • 2 人关注 • 1 个回复 • 3554 次浏览 • 2020-02-09 12:47
socketio中client的sio wait用法
李魔佛 发表了文章 • 1 个评论 • 4934 次浏览 • 2020-01-08 20:30
import time
import socketio
sio = socketio.Client()
start_timer = None
def send_ping():
    """Record the send time and emit a ``ping_from_client`` event."""
    global start_timer
    start_timer = time.time()
    sio.emit('ping_from_client')
@sio.event
def connect():
    """Handle the ``connect`` event by sending the first ping."""
    print('connected to server')
    send_ping()
@sio.event
def pong_from_server(data):
    """Print the round-trip latency, wait one second, then ping again.

    ``start_timer`` is only read here, so no ``global`` statement is needed.
    """
    latency = time.time() - start_timer
    print('latency is {0:.2f} ms'.format(latency * 1000))
    sio.sleep(1)
    send_ping()
if __name__ == '__main__':
    sio.connect('http://localhost:5000')
    # As the article notes, 'next' is not printed while wait() is in place.
    sio.wait()
    print('next')
比如上述代码中,如果调用了sio.wait() , 那么next是不会被打印的。
如果注释掉后,那么next就可以正常被打印。 查看全部
import time
import socketio
sio = socketio.Client()
start_timer = None
def send_ping():
    """Record the send time and emit a ``ping_from_client`` event."""
    global start_timer
    start_timer = time.time()
    sio.emit('ping_from_client')
@sio.event
def connect():
    """Handle the ``connect`` event by sending the first ping."""
    print('connected to server')
    send_ping()
@sio.event
def pong_from_server(data):
    """Print the round-trip latency, wait one second, then ping again.

    ``start_timer`` is only read here, so no ``global`` statement is needed.
    """
    latency = time.time() - start_timer
    print('latency is {0:.2f} ms'.format(latency * 1000))
    sio.sleep(1)
    send_ping()
if __name__ == '__main__':
    sio.connect('http://localhost:5000')
    # As the article notes, 'next' is not printed while wait() is in place.
    sio.wait()
    print('next')
比如上述代码中,如果调用了sio.wait() , 那么next是不会被打印的。
如果注释掉后,那么next就可以正常被打印。
jieba.posseg TypeError: cannot unpack non-iterable pair object 词性分析报错
李魔佛 发表了文章 • 0 个评论 • 4380 次浏览 • 2019-11-23 10:12
例子:import jieba.posseg as pseg
seg_list = pseg.cut("我爱北京天安门")
for word,flag in seg_list:
print(word)
print(flag)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-f105f6980f88> in <module>()
1 import jieba.posseg as pseg
2 seg_list = pseg.cut("我爱北京天安门")
----> 3 for word,flag in seg_list:
4 print(word)
5 print(flag)
TypeError: cannot unpack non-iterable pair object
原因是新版本中seg_list是一个生成器,每次迭代得到的是一个pair对象,不能直接解包,所以只能先 for w in seg_list,然后从 w 中取出 word 和 flag:
for w in seg_list:
    print(w.word)
    print(w.flag)
这样问题就解决了。 查看全部
例子:
import jieba.posseg as pseg
seg_list = pseg.cut("我爱北京天安门")
for word,flag in seg_list:
print(word)
print(flag)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-f105f6980f88> in <module>()
1 import jieba.posseg as pseg
2 seg_list = pseg.cut("我爱北京天安门")
----> 3 for word,flag in seg_list:
4 print(word)
5 print(flag)
TypeError: cannot unpack non-iterable pair object
原因是新版本中seg_list是一个生成器,每次迭代得到的是一个pair对象,不能直接解包,所以只能先 for w in seg_list,
然后从 w 中取出 word 和 flag:
for w in seg_list:
    print(w.word)
    print(w.flag)
这样问题就解决了。
scrapy在settings中定义变量不能包含小写!
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 2936 次浏览 • 2019-11-16 16:39
比如定义了一个叫 Redis_host = '192.168.1.1',的值
然后在spider中,如果你调用self.settings.get('Redis_host')
那么返回值是 None。
如果用REDIS_HOST定义,那么就可以正确返回它的值。
如果你一定要用小写,也有其他方法可正常调用。
先导入settings文件
from xxxx import settings # xxxx为项目名
host = settings.Redis_host # 直接导入一个文件的形式来调用是可以的 查看全部
比如定义了一个叫 Redis_host = '192.168.1.1',的值
然后在spider中,如果你调用self.settings.get('Redis_host')
那么返回值是 None。
如果用REDIS_HOST定义,那么就可以正确返回它的值。
如果你一定要用小写,也有其他方法可正常调用。
先导入settings文件
from xxxx import settings # xxxx为项目名
host = settings.Redis_host # 直接导入一个文件的形式来调用是可以的
etree.strip_tags的用法
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 4175 次浏览 • 2019-10-24 11:24
它把参数中的标签从源htmlelement中删除,并且把里面的标签文本给合并进来。
举个例子:from lxml.html import etree
from lxml.html import fromstring, HtmlElement
test_html = '''<p><span>hello</span><span>world</span></p>'''
test_element = fromstring(test_html)
etree.strip_tags(test_element,'span') # 清除span标签
etree.tostring(test_element)
因为上述操作直接应用于test_element上的,所以test_element的值已经被修改了。
所以现在test_element 的值是
b'<p>helloworld</p>'
原创文章,转载请注明出处
http://30daydo.com/article/553
查看全部
它把参数中的标签从源htmlelement中删除,并且把里面的标签文本给合并进来。
举个例子:
from lxml.html import etree
from lxml.html import fromstring, HtmlElement
test_html = '''<p><span>hello</span><span>world</span></p>'''
test_element = fromstring(test_html)
etree.strip_tags(test_element,'span') # 清除span标签
etree.tostring(test_element)
因为上述操作直接应用于test_element上的,所以test_element的值已经被修改了。
所以现在test_element 的值是
b'<p>helloworld</p>'
原创文章,转载请注明出处
http://30daydo.com/article/553
mumu模拟器adb无法识别
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 5244 次浏览 • 2019-10-17 08:41
<Forwarding name="ADB_PORT" proto="1" hostip="127.0.0.1" hostport="7555" guestport="5555"/>
在mumu模拟器的配置文件里面可以看到这个配置信息。
adb connect 127.0.0.1:7555
然后adb shell 就可以了。
配置文件名是:myandrovm_vbox86.nemu 查看全部
<Forwarding name="ADB_PORT" proto="1" hostip="127.0.0.1" hostport="7555" guestport="5555"/>
在mumu模拟器的配置文件里面可以看到这个配置信息。
adb connect 127.0.0.1:7555
然后adb shell 就可以了。
配置文件名是:myandrovm_vbox86.nemu
aiohttp异步下载图片
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 4763 次浏览 • 2019-09-16 17:14
headers={'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
async def getPage(num):
    """Download image number *num* and save it as ``<num>.jpg``.

    Nothing is written unless the server answers with HTTP 200.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url.format(num), headers=headers) as resp:
            if resp.status != 200:
                return
            # Read the body before opening the file so a failed download
            # does not leave an empty file behind.
            data = await resp.read()
            # The context manager closes the file even if the write raises.
            async with aiofiles.open('{}.jpg'.format(num), mode='wb') as f:
                await f.write(data)
async def main():
    """Download images 0-4 concurrently and report any that failed."""
    # asyncio.wait() no longer accepts bare coroutines (Python 3.11+);
    # gather() schedules them itself. return_exceptions=True keeps the
    # run best-effort: one failed download does not abort the others.
    results = await asyncio.gather(
        *(getPage(i) for i in range(5)), return_exceptions=True)
    for num, result in enumerate(results):
        if isinstance(result, Exception):
            print('download {} failed: {!r}'.format(num, result))


asyncio.run(main())
原创文章,
转载请注明出处:
http://30daydo.com/article/537
查看全部
url = 'http://xyhz.huizhou.gov.cn/static/js/common/jigsaw/images/{}.jpg'
headers={'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
async def getPage(num):
    """Download image number *num* and save it as ``<num>.jpg``.

    Nothing is written unless the server answers with HTTP 200.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url.format(num), headers=headers) as resp:
            if resp.status != 200:
                return
            # Read the body before opening the file so a failed download
            # does not leave an empty file behind.
            data = await resp.read()
            # The context manager closes the file even if the write raises.
            async with aiofiles.open('{}.jpg'.format(num), mode='wb') as f:
                await f.write(data)
async def main():
    """Download images 0-4 concurrently and report any that failed."""
    # asyncio.wait() no longer accepts bare coroutines (Python 3.11+);
    # gather() schedules them itself. return_exceptions=True keeps the
    # run best-effort: one failed download does not abort the others.
    results = await asyncio.gather(
        *(getPage(i) for i in range(5)), return_exceptions=True)
    for num, result in enumerate(results):
        if isinstance(result, Exception):
            print('download {} failed: {!r}'.format(num, result))


asyncio.run(main())
原创文章,
转载请注明出处:
http://30daydo.com/article/537
基于文本及符号密度的网页正文提取方法 python实现
李魔佛 发表了文章 • 0 个评论 • 4990 次浏览 • 2019-09-10 15:19
项目路径https://github.com/Rockyzsu/CodePool/tree/master/GeneralNewsExtractor
完成后在本文详细介绍,
请密切关注。 查看全部
项目路径https://github.com/Rockyzsu/CodePool/tree/master/GeneralNewsExtractor
完成后在本文详细介绍,
请密切关注。
python exchange保存备份邮件
李魔佛 发表了文章 • 3 个评论 • 3597 次浏览 • 2019-09-09 10:50
方便自己平时备份邮件。# -*-coding=utf-8-*-
# @Time : 2019/9/9 9:25
# @File : mail_backup.py
# @Author :
import codecs
import re
import config
import os
from exchangelib import DELEGATE, Account, Credentials, Configuration, NTLM, Message, Mailbox, HTMLBody,FileAttachment,ItemAttachment
from exchangelib.protocol import BaseProtocol, NoVerifyHTTPAdapter
#此句用来消除ssl证书错误,exchange使用自签证书需加上
BaseProtocol.HTTP_ADAPTER_CLS = NoVerifyHTTPAdapter
# 输入你的域账号如example\xxx
cred = Credentials(r'example\xxx', 你的邮箱密码)
configx = Configuration(server='mail.credlink.com', credentials=cred, auth_type=NTLM)
a = Account(
primary_smtp_address='你的邮箱地址', config=configx, autodiscover=False, access_type=DELEGATE
)
def _safe_filename(name, default='untitled'):
    """Return *name* with characters that are illegal in file names replaced by '-'.

    Falls back to *default* when *name* is empty or None.
    """
    return re.sub(r'[\\/:*?"<>|]', '-', name or default)


# Create the output directory up front so the first write cannot fail on it.
os.makedirs('inbox', exist_ok=True)

# Back up the 100 most recently received messages and their attachments.
for item in a.inbox.all().order_by('-datetime_received')[:100]:
    print(item.subject, item.sender, item.unique_body, item.datetime_received)
    local_path = os.path.join('inbox', _safe_filename(item.subject) + '.html')
    with codecs.open(local_path, 'w', 'utf-8') as f:
        f.write(item.unique_body)

    for attachment in item.attachments:
        if isinstance(attachment, FileAttachment):
            # Use the sanitised name; the raw attachment name may contain
            # characters that are not valid in a path.
            local_path = os.path.join('inbox', _safe_filename(attachment.name))
            with open(local_path, 'wb') as f:
                f.write(attachment.content)
            print('Saved attachment to', local_path)
        elif isinstance(attachment, ItemAttachment):
            if isinstance(attachment.item, Message):
                # Name the file after the attached message's subject so
                # attached messages do not overwrite one another.
                local_path = os.path.join(
                    'inbox', _safe_filename(attachment.item.subject))
                with codecs.open(local_path, 'w', 'utf-8') as f:
                    f.write(attachment.item.body)
原创文章,
转载请注明出处
http://30daydo.com/article/534
查看全部
方便自己平时备份邮件。
# -*-coding=utf-8-*-
# @Time : 2019/9/9 9:25
# @File : mail_backup.py
# @Author :
import codecs
import re
import config
import os
from exchangelib import DELEGATE, Account, Credentials, Configuration, NTLM, Message, Mailbox, HTMLBody,FileAttachment,ItemAttachment
from exchangelib.protocol import BaseProtocol, NoVerifyHTTPAdapter
#此句用来消除ssl证书错误,exchange使用自签证书需加上
BaseProtocol.HTTP_ADAPTER_CLS = NoVerifyHTTPAdapter
# 输入你的域账号如example\xxx
cred = Credentials(r'example\xxx', 你的邮箱密码)
configx = Configuration(server='mail.credlink.com', credentials=cred, auth_type=NTLM)
a = Account(
primary_smtp_address='你的邮箱地址', config=configx, autodiscover=False, access_type=DELEGATE
)
def _safe_filename(name, default='untitled'):
    """Return *name* with characters that are illegal in file names replaced by '-'.

    Falls back to *default* when *name* is empty or None.
    """
    return re.sub(r'[\\/:*?"<>|]', '-', name or default)


# Create the output directory up front so the first write cannot fail on it.
os.makedirs('inbox', exist_ok=True)

# Back up the 100 most recently received messages and their attachments.
for item in a.inbox.all().order_by('-datetime_received')[:100]:
    print(item.subject, item.sender, item.unique_body, item.datetime_received)
    local_path = os.path.join('inbox', _safe_filename(item.subject) + '.html')
    with codecs.open(local_path, 'w', 'utf-8') as f:
        f.write(item.unique_body)

    for attachment in item.attachments:
        if isinstance(attachment, FileAttachment):
            # Use the sanitised name; the raw attachment name may contain
            # characters that are not valid in a path.
            local_path = os.path.join('inbox', _safe_filename(attachment.name))
            with open(local_path, 'wb') as f:
                f.write(attachment.content)
            print('Saved attachment to', local_path)
        elif isinstance(attachment, ItemAttachment):
            if isinstance(attachment.item, Message):
                # Name the file after the attached message's subject so
                # attached messages do not overwrite one another.
                local_path = os.path.join(
                    'inbox', _safe_filename(attachment.item.subject))
                with codecs.open(local_path, 'w', 'utf-8') as f:
                    f.write(attachment.item.body)
原创文章,
转载请注明出处
http://30daydo.com/article/534
性能对比 pypy vs python
李魔佛 发表了文章 • 0 个评论 • 4953 次浏览 • 2019-09-06 17:04
不试不知道,一试吓一跳。
如果是CPU密集型的程序,pypy3的执行速度比python要快上一百倍。
talk is cheap, show me the code!
代码很简单,运行加法运算:
执行2亿次(LOOP = 2*10**8)
import time
LOOP = 2*10**8
def add(x, y):
    """Return ``x + y``."""
    return x + y
def cpu_pressure(loop):
    """Call ``add`` *loop* times to keep the CPU busy; returns nothing."""
    for i in range(loop):
        # The result is never read; the loop exists only to burn CPU.
        result = add(i, i + 1)
if __name__ == '__main__':
    # Time a single run of the whole loop.
    start = time.time()
    cpu_pressure(LOOP)
    print(f'time used {time.time()-start}s')
python执行:
python main.py
返回用时:time used 21.422261476516724s
pypy执行:
pypy main.py
返回用时:time used 0.1925642490386963s
差距真的很大。 查看全部
不试不知道,一试吓一跳。
如果是CPU密集型的程序,pypy3的执行速度比python要快上一百倍。
talk is cheap, show me the code!
代码很简单,运行加法运算:
执行2亿次(LOOP = 2*10**8)
import time
LOOP = 2*10**8
def add(x, y):
    """Return ``x + y``."""
    return x + y
def cpu_pressure(loop):
    """Call ``add`` *loop* times to keep the CPU busy; returns nothing."""
    for i in range(loop):
        # The result is never read; the loop exists only to burn CPU.
        result = add(i, i + 1)
if __name__ == '__main__':
    # Time a single run of the whole loop.
    start = time.time()
    cpu_pressure(LOOP)
    print(f'time used {time.time()-start}s')
python执行:
python main.py
返回用时:time used 21.422261476516724s
pypy执行:
pypy main.py
返回用时:time used 0.1925642490386963s
差距真的很大。
scrapy源码分析<一>:入口函数以及是如何运行
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 5922 次浏览 • 2019-08-31 10:47
下面我们从源码分析一下scrapy执行的流程:
执行scrapy crawl 命令时,调用的是Command类class Command(ScrapyCommand):
requires_project = True
def syntax(self):
return '[options]'
def short_desc(self):
return 'Runs all of the spiders - My Defined'
def run(self,args,opts):
print('==================')
print(type(self.crawler_process))
spider_list = self.crawler_process.spiders.list() # 找到爬虫类
for name in spider_list:
print('=================')
print(name)
self.crawler_process.crawl(name,**opts.__dict__)
self.crawler_process.start()
然后我们去看看crawler_process,它是ScrapyCommand的一个属性,指向一个CrawlerProcess实例,而CrawlerProcess又是CrawlerRunner的子类
在CrawlerRunner构造函数里面主要作用就是这个 def __init__(self, settings=None):
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
self.spider_loader = _get_spider_loader(settings) # 构造爬虫
self._crawlers = set()
self._active = set()
self.bootstrap_failed = False
1. 加载配置文件def _get_spider_loader(settings):
cls_path = settings.get('SPIDER_LOADER_CLASS')
# settings文件没有定义SPIDER_LOADER_CLASS,所以这里获取到的是系统的默认配置文件,
# 默认配置文件在接下来的代码块A
# SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
loader_cls = load_object(cls_path)
# 这个函数就是根据路径转为类对象,也就是把上面 scrapy.spiderloader.SpiderLoader 这个
# 字符串变成一个类对象
# 具体的load_object 对象代码见下面代码块B
return loader_cls.from_settings(settings.frozencopy())
默认配置文件default_settings.py
# 代码块A
#......省略若干
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'
SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'  # 就是这个值
SPIDER_LOADER_WARN_ONLY = False
SPIDER_MIDDLEWARES = {}
load_object的实现# 代码块B 为了方便,我把异常处理的去除
from importlib import import_module  # importlib is part of the standard library


def load_object(path):
    """Import and return the object named by the dotted *path*.

    ``'scrapy.spiderloader.SpiderLoader'`` is split at the last dot into
    the module ``'scrapy.spiderloader'`` and the attribute ``'SpiderLoader'``.
    Error handling is left out: a path without a dot raises ``ValueError``
    and a missing attribute raises ``AttributeError``.
    """
    dot = path.rindex('.')
    # Everything before the last dot is the module path, the rest is the
    # name of the object inside that module.
    module, name = path[:dot], path[dot + 1:]
    mod = import_module(module)
    # Fetch the named object from the imported module.
    obj = getattr(mod, name)
    return obj
测试代码:In [33]: mod = import_module(module)
In [34]: mod
Out[34]: <module 'scrapy.spiderloader' from '/home/xda/anaconda3/lib/python3.7/site-packages/scrapy/spiderloader.py'>
In [35]: getattr(mod,name)
Out[35]: scrapy.spiderloader.SpiderLoader
In [36]: obj = getattr(mod,name)
In [37]: obj
Out[37]: scrapy.spiderloader.SpiderLoader
In [38]: type(obj)
Out[38]: type
在代码块A中,loader_cls是SpiderLoader,最后返回的的是SpiderLoader.from_settings(settings.frozencopy())
接下来看看SpiderLoader.from_settings, def from_settings(cls, settings):
return cls(settings)
返回类对象自己,所以直接看__init__函数即可class SpiderLoader(object):
"""
SpiderLoader is a class which locates and loads spiders
in a Scrapy project.
"""
def __init__(self, settings):
self.spider_modules = settings.getlist('SPIDER_MODULES')
# 获得settting中的模块名字,创建scrapy的时候就默认帮你生成了
# 你可以看看你的settings文件里面的内容就可以找到这个值,是一个list
self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')
self._spiders = {}
self._found = defaultdict(list)
self._load_all_spiders() # 加载所有爬虫
核心就是这个_load_all_spiders:
走起:def _load_all_spiders(self):
for name in self.spider_modules:
for module in walk_modules(name): # 这个遍历文件夹里面的文件,然后再转化为类对象,
# 保存到字典:self._spiders = {}
self._load_spiders(module) # 模块变成spider
self._check_name_duplicates() # 去重,如果名字一样就异常
接下来看看_load_spiders
核心就是下面的。def iter_spider_classes(module):
from scrapy.spiders import Spider
for obj in six.itervalues(vars(module)): # 找到模块里面的变量,然后迭代出来
if inspect.isclass(obj) and \
issubclass(obj, Spider) and \
obj.__module__ == module.__name__ and \
getattr(obj, 'name', None): # 有name属性,继承于Spider
yield obj
这个obj就是我们平时写的spider类了。
原来分析了这么多,才找到了我们平时写的爬虫类
待续。。。。
原创文章
转载请注明出处
http://30daydo.com/article/530
查看全部
下面我们从源码分析一下scrapy执行的流程:
执行scrapy crawl 命令时,调用的是Command类
class Command(ScrapyCommand):
    """Custom command that schedules every spider of the project and runs them."""

    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders - My Defined'

    def run(self, args, opts):
        """Queue a crawl for each spider name, then start the crawler process."""
        print('==================')
        print(type(self.crawler_process))
        # Names of all spiders known to the crawler process.
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            print('=================')
            print(name)
            self.crawler_process.crawl(name, **opts.__dict__)
        # Start once, after every spider has been scheduled.
        self.crawler_process.start()
然后我们去看看crawler_process,它是ScrapyCommand的一个属性,指向一个CrawlerProcess实例,而CrawlerProcess又是CrawlerRunner的子类
在CrawlerRunner构造函数里面主要作用就是这个
def __init__(self, settings=None):
    """Normalise *settings* and build the spider loader.

    A plain dict or ``None`` is wrapped in a ``Settings`` object first.
    """
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.settings = settings
    # The spider loader is what finds the project's spiders.
    self.spider_loader = _get_spider_loader(settings)
    self._crawlers = set()
    self._active = set()
    self.bootstrap_failed = False
1. 加载配置文件
def _get_spider_loader(settings):
    """Build the spider loader named by the ``SPIDER_LOADER_CLASS`` setting."""
    # A project settings file normally does not define SPIDER_LOADER_CLASS,
    # so the value comes from the default settings (code block A below):
    #   SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
    cls_path = settings.get('SPIDER_LOADER_CLASS')
    # load_object turns the dotted path string, e.g.
    # 'scrapy.spiderloader.SpiderLoader', into the class object itself
    # (its code is shown in code block B below).
    loader_cls = load_object(cls_path)
    return loader_cls.from_settings(settings.frozencopy())
默认配置文件default_settings.py
# 代码块A
#......省略若干
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'
SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'  # 就是这个值
SPIDER_LOADER_WARN_ONLY = False
SPIDER_MIDDLEWARES = {}
load_object的实现
# Code block B -- exception handling removed for brevity.
from importlib import import_module  # importlib is part of the standard library


def load_object(path):
    """Import and return the object named by the dotted *path*.

    ``'scrapy.spiderloader.SpiderLoader'`` is split at the last dot into
    the module ``'scrapy.spiderloader'`` and the attribute ``'SpiderLoader'``.
    Error handling is left out: a path without a dot raises ``ValueError``
    and a missing attribute raises ``AttributeError``.
    """
    dot = path.rindex('.')
    # Everything before the last dot is the module path, the rest is the
    # name of the object inside that module.
    module, name = path[:dot], path[dot + 1:]
    mod = import_module(module)
    # Fetch the named object from the imported module.
    obj = getattr(mod, name)
    return obj
测试代码:
In [33]: mod = import_module(module)
In [34]: mod
Out[34]: <module 'scrapy.spiderloader' from '/home/xda/anaconda3/lib/python3.7/site-packages/scrapy/spiderloader.py'>
In [35]: getattr(mod,name)
Out[35]: scrapy.spiderloader.SpiderLoader
In [36]: obj = getattr(mod,name)
In [37]: obj
Out[37]: scrapy.spiderloader.SpiderLoader
In [38]: type(obj)
Out[38]: type
在代码块A中,loader_cls是SpiderLoader,最后返回的的是SpiderLoader.from_settings(settings.frozencopy())
接下来看看SpiderLoader.from_settings,
@classmethod
def from_settings(cls, settings):
    """Alternate constructor: return ``cls(settings)``.

    Called on the class itself (``SpiderLoader.from_settings(...)``), so it
    has to be a classmethod.
    """
    return cls(settings)
返回类对象自己,所以直接看__init__函数即可
class SpiderLoader(object):
    """
    SpiderLoader is a class which locates and loads spiders
    in a Scrapy project.
    """

    def __init__(self, settings):
        # Module names from the SPIDER_MODULES setting (a list); the article
        # notes this entry is generated in settings.py with the project.
        self.spider_modules = settings.getlist('SPIDER_MODULES')
        self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')
        self._spiders = {}
        self._found = defaultdict(list)
        # Load every spider as soon as the loader is constructed.
        self._load_all_spiders()
核心就是这个_load_all_spiders:
走起:
def _load_all_spiders(self):
    """Load the spiders of every module listed in ``self.spider_modules``."""
    for name in self.spider_modules:
        # walk_modules yields the modules found under the package *name*.
        for module in walk_modules(name):
            # Register the module's spiders (the article notes they are
            # stored in the self._spiders dict).
            self._load_spiders(module)
    # Per the article: duplicate spider names are reported here.
    self._check_name_duplicates()
接下来看看_load_spiders
核心就是下面的。
def iter_spider_classes(module):
    """Yield every spider class defined in *module*.

    A class qualifies when it subclasses ``Spider``, is defined in *module*
    itself (not merely imported into it) and has a truthy ``name``.
    """
    from scrapy.spiders import Spider

    # vars(module) maps the module's names to objects; check each value.
    for obj in six.itervalues(vars(module)):
        if inspect.isclass(obj) and \
           issubclass(obj, Spider) and \
           obj.__module__ == module.__name__ and \
           getattr(obj, 'name', None):
            yield obj
这个obj就是我们平时写的spider类了。
原来分析了这么多,才找到了我们平时写的爬虫类
待续。。。。
原创文章
转载请注明出处
http://30daydo.com/article/530
anaconda环境下无法启动jupyter notebook
李魔佛 发表了文章 • 0 个评论 • 7280 次浏览 • 2019-08-19 17:16
报错: from . import (constants, error, message, context,
ImportError: DLL load failed: 找不到指定的模块。
但是可以直接在Anaconda navigator中直接启动,所以判断是环境问题。
切换到anaconda的虚拟环境,(在菜单中进入anaconda prompt command),在当前命令行下执行 jupyter notebook就能够正常运行。
查看全部
报错:
from . import (constants, error, message, context,
ImportError: DLL load failed: 找不到指定的模块。
但是可以直接在Anaconda navigator中直接启动,所以判断是环境问题。
切换到anaconda的虚拟环境,(在菜单中进入anaconda prompt command),在当前命令行下执行 jupyter notebook就能够正常运行。
random.randint的用法
李魔佛 发表了文章 • 0 个评论 • 13162 次浏览 • 2019-08-01 16:31
from random import randint
randint(0,1)
Out[25]: 1
randint(0,1)
Out[26]: 1
randint(0,1)
Out[27]: 1
randint(0,1)
Out[28]: 1
randint(0,1)
Out[29]: 0
randint(0,1)
Out[30]: 1
random.randint(a,b)
输出的整数范围包含a和b,以及它们之间的所有整数
查看全部
from random import randint
randint(0,1)
Out[25]: 1
randint(0,1)
Out[26]: 1
randint(0,1)
Out[27]: 1
randint(0,1)
Out[28]: 1
randint(0,1)
Out[29]: 0
randint(0,1)
Out[30]: 1
random.randint(a,b)
输出的整数范围包含a和b,以及它们之间的所有整数
frontera运行link_follower.py 报错:doesn't define any object named 'FIFO'
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 3556 次浏览 • 2019-07-18 11:29
from __future__ import print_function
import re
import requests
from frontera.contrib.requests.manager import RequestsFrontierManager
# from frontera.contrib.requests.manager import RequestsFrontierManager
from frontera import Settings
from six.moves.urllib.parse import urljoin
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
# SETTINGS.BACKEND = 'frontera.contrib.backends.memory.MemoryDistributedBackend'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.MAX_REQUESTS = 100
SETTINGS.MAX_NEXT_REQUESTS = 10
SEEDS = [
'http://www.imdb.com',
]
# Matches <a ... href="..."> tags case-insensitively; group 1 is the href value.
LINK_RE = re.compile(r'<a.+?href="(.*?)".?>', re.I)


def extract_page_links(response):
    """Return every href found in ``response.text`` as an absolute URL.

    Relative links are resolved against ``response.url``.
    """
    return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)]
if __name__ == '__main__':
    frontier = RequestsFrontierManager(SETTINGS)
    frontier.add_seeds([requests.Request(url=url) for url in SEEDS])
    # Keep crawling until the frontier has no more requests to hand out.
    while True:
        next_requests = frontier.get_next_requests()
        if not next_requests:
            break
        for request in next_requests:
            try:
                response = requests.get(request.url)
                links = [
                    requests.Request(url=url)
                    for url in extract_page_links(response)
                ]
                frontier.page_crawled(response)
                print('Crawled', response.url, '(found', len(links), 'urls)')
                if links:
                    frontier.links_extracted(request, links)
            except requests.RequestException as e:
                # Report the failure to the frontier by exception class name.
                error_code = type(e).__name__
                frontier.request_error(request, error_code)
                print('Failed to process request', request.url, 'Error:', e)
无论用的py2或者py3,都会报以下的错误。raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
NameError: Module 'frontera.contrib.backends.memory' doesn't define any object named 'FIFO' 查看全部
from __future__ import print_function
import re
import requests
from frontera.contrib.requests.manager import RequestsFrontierManager
# from frontera.contrib.requests.manager import RequestsFrontierManager
from frontera import Settings
from six.moves.urllib.parse import urljoin
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO'
# SETTINGS.BACKEND = 'frontera.contrib.backends.memory.MemoryDistributedBackend'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.MAX_REQUESTS = 100
SETTINGS.MAX_NEXT_REQUESTS = 10
SEEDS = [
'http://www.imdb.com',
]
# Matches <a ... href="..."> tags case-insensitively; group 1 is the href value.
LINK_RE = re.compile(r'<a.+?href="(.*?)".?>', re.I)


def extract_page_links(response):
    """Return every href found in ``response.text`` as an absolute URL.

    Relative links are resolved against ``response.url``.
    """
    return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)]
if __name__ == '__main__':
    frontier = RequestsFrontierManager(SETTINGS)
    frontier.add_seeds([requests.Request(url=url) for url in SEEDS])
    # Keep crawling until the frontier has no more requests to hand out.
    while True:
        next_requests = frontier.get_next_requests()
        if not next_requests:
            break
        for request in next_requests:
            try:
                response = requests.get(request.url)
                links = [
                    requests.Request(url=url)
                    for url in extract_page_links(response)
                ]
                frontier.page_crawled(response)
                print('Crawled', response.url, '(found', len(links), 'urls)')
                if links:
                    frontier.links_extracted(request, links)
            except requests.RequestException as e:
                # Report the failure to the frontier by exception class name.
                error_code = type(e).__name__
                frontier.request_error(request, error_code)
                print('Failed to process request', request.url, 'Error:', e)
无论用的py2或者py3,都会报以下的错误。
raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
NameError: Module 'frontera.contrib.backends.memory' doesn't define any object named 'FIFO'
scrapy-rabbitmq 不支持python3 [修改源码使它支持]
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 3297 次浏览 • 2019-07-17 17:24
在python3上运行的时候会报错。
需要修改以下地方:
待续。。
在python3上运行的时候会报错。
需要修改以下地方:
待续。。
scrapy rabbitmq 分布式爬虫
python爬虫 • 李魔佛 发表了文章 • 0 个评论 • 6069 次浏览 • 2019-07-17 16:59
rabbitmq是个不错的消息队列服务,可以配合scrapy作为消息队列.
下面是一个简单的demo:import re
import requests
import scrapy
from scrapy import Request
from rabbit_spider import settings
from scrapy.log import logger
import json
from rabbit_spider.items import RabbitSpiderItem
import datetime
from scrapy.selector import Selector
import pika
# from scrapy_rabbitmq.spiders import RabbitMQMixin
# from scrapy.contrib.spiders import CrawlSpider
class Website(scrapy.Spider):
    """Demo spider that consumes RabbitMQ messages once the start page is fetched."""

    name = "rabbit"

    def start_requests(self):
        """Yield the single start request for the 36kr news page."""
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Host': '36kr.com',
            'Referer': 'https://36kr.com/information/web_news',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        url = 'https://36kr.com/information/web_news'
        yield Request(url=url, headers=headers)

    def parse(self, response):
        """Connect to RabbitMQ and consume messages routed with key 'info'."""
        credentials = pika.PlainCredentials('admin', 'admin')
        connection = pika.BlockingConnection(
            pika.ConnectionParameters('192.168.1.101', 5672, '/', credentials))
        channel = connection.channel()
        channel.exchange_declare(exchange='direct_log', exchange_type='direct')

        # An empty queue name asks the broker to generate one; read it back.
        result = channel.queue_declare(exclusive=True, queue='')
        queue_name = result.method.queue

        info = 'info'
        # Bind the queue to the exchange for this routing key.
        channel.queue_bind(
            exchange='direct_log',
            routing_key=info,
            queue=queue_name
        )
        print('start to receive [{}]'.format(info))
        channel.basic_consume(
            on_message_callback=self.callback_func,
            queue=queue_name,
            auto_ack=True,
        )
        # NOTE(review): start_consuming() presumably blocks here, so parse()
        # would not return while the consumer runs -- confirm.
        channel.start_consuming()

    def callback_func(self, ch, method, properties, body):
        """Print the body of each message received from the queue."""
        print(body)
启动spider:from scrapy import cmdline
cmdline.execute('scrapy crawl rabbit'.split())
然后往rabbitmq里面推送数据:import pika
import settings
# Publish one URL to the `direct_log` exchange using the `info` routing key.
credentials = pika.PlainCredentials('admin', 'admin')
connection = pika.BlockingConnection(
    pika.ConnectionParameters('192.168.1.101', 5672, '/', credentials))
try:
    channel = connection.channel()
    # A direct exchange routes on the routing key; `fanout` would instead
    # broadcast (multicast) to every bound queue.
    channel.exchange_declare(exchange='direct_log', exchange_type='direct')
    routing_key = 'info'
    message = 'https://36kr.com/pp/api/aggregation-entity?type=web_latest_article&b_id=59499&per_page=30'
    channel.basic_publish(
        exchange='direct_log',
        routing_key=routing_key,
        body=message
    )
    print('sending message {}'.format(message))
finally:
    # Close the connection even when declaring or publishing raised.
    connection.close()
推送数据后,scrapy会马上接收到队列里面的数据。
注意不能在start_requests里面写等待队列的命令,因为start_requests函数需要返回一个生成器,否则程序会报错。
待续。。。
###### 2019-08-29 更新 ###################
发现一个坑,就是rabbitMQ在接收到数据后,无法在回调函数里面使用yield生成器。
查看全部
rabbitmq是个不错的消息队列服务,可以配合scrapy作为消息队列.
下面是一个简单的demo:
import re
import requests
import scrapy
from scrapy import Request
from rabbit_spider import settings
from scrapy.log import logger
import json
from rabbit_spider.items import RabbitSpiderItem
import datetime
from scrapy.selector import Selector
import pika
# from scrapy_rabbitmq.spiders import RabbitMQMixin
# from scrapy.contrib.spiders import CrawlSpider
class Website(scrapy.Spider):
    """Demo spider: fetch one 36kr page, then consume messages from RabbitMQ.

    ``start_requests`` issues a single seed request. When its response reaches
    ``parse``, the spider binds a broker-named exclusive queue to the
    ``direct_log`` exchange and prints every message delivered with the
    ``info`` routing key.
    """

    name = "rabbit"

    def start_requests(self):
        """Yield the single seed request for the 36kr news listing page."""
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Host': '36kr.com',
            'Referer': 'https://36kr.com/information/web_news',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        url = 'https://36kr.com/information/web_news'
        yield Request(url=url,
                      headers=headers)

    def parse(self, response):
        """Subscribe to the ``direct_log`` exchange and consume its messages.

        ``response`` is not inspected; receiving it is only the trigger for
        opening the RabbitMQ connection.
        """
        credentials = pika.PlainCredentials('admin', 'admin')
        connection = pika.BlockingConnection(
            pika.ConnectionParameters('192.168.1.101', 5672, '/', credentials))
        channel = connection.channel()
        channel.exchange_declare(exchange='direct_log', exchange_type='direct')

        # An empty queue name lets the broker pick one; the generated name is
        # read back from the declare result.
        result = channel.queue_declare(exclusive=True, queue='')
        queue_name = result.method.queue

        info = 'info'
        # Several routing keys could be bound here; this demo binds only one.
        channel.queue_bind(
            exchange='direct_log',
            routing_key=info,
            queue=queue_name
        )
        print('start to receive [{}]'.format(info))
        channel.basic_consume(
            on_message_callback=self.callback_func,
            queue=queue_name,
            auto_ack=True,
        )
        # NOTE: start_consuming() on a BlockingConnection blocks until
        # consuming stops, so parse() does not return while listening.
        channel.start_consuming()

    def callback_func(self, ch, method, properties, body):
        """Print the raw body of a delivered message."""
        print(body)
启动spider:
from scrapy import cmdline
# Launch the spider named "rabbit" by handing scrapy the argument list
# ['scrapy', 'crawl', 'rabbit'].
cmdline.execute('scrapy crawl rabbit'.split())
然后往rabbitmq里面推送数据:
import pika
import settings
# Publish one URL to the `direct_log` exchange using the `info` routing key.
credentials = pika.PlainCredentials('admin', 'admin')
connection = pika.BlockingConnection(
    pika.ConnectionParameters('192.168.1.101', 5672, '/', credentials))
try:
    channel = connection.channel()
    # A direct exchange routes on the routing key; `fanout` would instead
    # broadcast (multicast) to every bound queue.
    channel.exchange_declare(exchange='direct_log', exchange_type='direct')
    routing_key = 'info'
    message = 'https://36kr.com/pp/api/aggregation-entity?type=web_latest_article&b_id=59499&per_page=30'
    channel.basic_publish(
        exchange='direct_log',
        routing_key=routing_key,
        body=message
    )
    print('sending message {}'.format(message))
finally:
    # Close the connection even when declaring or publishing raised.
    connection.close()
推送数据后,scrapy会马上接收到队列里面的数据。
注意不能在start_requests里面写等待队列的命令,因为start_requests函数需要返回一个生成器,否则程序会报错。
待续。。。
###### 2019-08-29 更新 ###################
发现一个坑,就是rabbitMQ在接收到数据后,无法在回调函数里面使用yield生成器。
exchange_declare() got an unexpected keyword argument 'type'
李魔佛 发表了文章 • 0 个评论 • 3018 次浏览 • 2019-07-16 14:40
exchange_type instead of type
# Username/password pair used to authenticate against the broker.
credentials = pika.PlainCredentials('admin','admin')
# Blocking connection to 192.168.1.101:5672 on virtual host '/'.
connection = pika.BlockingConnection(pika.ConnectionParameters('192.168.1.101',5672,'/',credentials))
channel = connection.channel()
channel.exchange_declare(exchange='logs',exchange_type='fanout')
查看全部
exchange_type instead of type
# Username/password pair used to authenticate against the broker.
credentials = pika.PlainCredentials('admin','admin')
# Blocking connection to 192.168.1.101:5672 on virtual host '/'.
connection = pika.BlockingConnection(pika.ConnectionParameters('192.168.1.101',5672,'/',credentials))
channel = connection.channel()
# The keyword must be `exchange_type`; passing `type=` fails with
# "exchange_declare() got an unexpected keyword argument 'type'".
channel.exchange_declare(exchange='logs',exchange_type='fanout')
twisted的getPage已经不建议使用,新接口为twisted.web.client.Agent
python爬虫 • 李魔佛 发表了文章 • 2 个评论 • 3718 次浏览 • 2019-07-12 11:31
We need to change these tests to use twisted.web.client.Agent instead, or a package named "treq", which is a Twisted flavor of the excellent (but blocking) requests library.
查看全部
Twisted-16.7.0 is coming soon, and it deprecates twisted.web.client.getPage (and client.HTTPClientFactory). We use these in some of the unit tests, to fetch one of the HTTP WAPI/WUI pages and make sure the contents look right.
We need to change these tests to use twisted.web.client.Agent instead, or a package named "treq", which is a Twisted flavor of the excellent (but blocking) requests library.
twisted reactor运行后,添加了addBoth函数,但是还是无法停止
李魔佛 发表了文章 • 0 个评论 • 4079 次浏览 • 2019-07-11 09:43
from scrapy.selector import Selector
# NOTE(review): this snippet assumes `defer` and `reactor` (twisted.internet)
# and `getPage` (twisted.web.client) are already imported — confirm.
def get_response_callback(content):
    """Decode the fetched page bytes as UTF-8 and print its <title> text."""
    txt = str(content, encoding='utf-8')
    resp = Selector(text=txt)
    title = resp.xpath('//title/text()').extract_first()
    print(title)


@defer.inlineCallbacks
def task():
    """Fetch the Baidu home page and pass the body to get_response_callback."""
    url = 'http://www.baidu.com'
    d = getPage(url.encode('utf-8'))
    d.addCallback(get_response_callback)
    yield d


def done():
    """Stop the reactor; accepts no arguments."""
    reactor.stop()


def done1(*args, **kwargs):
    """Stop the reactor, ignoring any arguments it is called with."""
    reactor.stop()


# Start four fetches and combine their Deferreds into a single DeferredList.
task_list = [task() for _ in range(4)]
dd = defer.DeferredList(task_list)
# Kept as in the article on purpose: addBoth calls its callback with the
# result as a positional argument, so the zero-argument done() raises
# TypeError inside the Deferred and reactor.stop() is never reached.
# Register done1 instead to make the program exit.
dd.addBoth(done)
reactor.run()
上面的代码是无法停止的,如果使用的是
dd.addBoth(done)
done函数的定义是没有参数的。
而使用另一个带参数的done1(*args,**kwargs),即 dd.addBoth(done1),
是可以正常退出的。原因是addBoth会把结果作为参数传给回调函数,无参数的done被调用时会抛出TypeError,里面的reactor.stop()就不会被执行。
原创文章
转载请注明出处:
http://30daydo.com/article/509
查看全部
from scrapy.selector import Selector
# NOTE(review): this snippet assumes `defer` and `reactor` (twisted.internet)
# and `getPage` (twisted.web.client) are already imported — confirm.
def get_response_callback(content):
    """Decode the fetched page bytes as UTF-8 and print its <title> text."""
    txt = str(content, encoding='utf-8')
    resp = Selector(text=txt)
    title = resp.xpath('//title/text()').extract_first()
    print(title)


@defer.inlineCallbacks
def task():
    """Fetch the Baidu home page and pass the body to get_response_callback."""
    url = 'http://www.baidu.com'
    d = getPage(url.encode('utf-8'))
    d.addCallback(get_response_callback)
    yield d


def done():
    """Stop the reactor; accepts no arguments."""
    reactor.stop()


def done1(*args, **kwargs):
    """Stop the reactor, ignoring any arguments it is called with."""
    reactor.stop()


# Start four fetches and combine their Deferreds into a single DeferredList.
task_list = [task() for _ in range(4)]
dd = defer.DeferredList(task_list)
# Kept as in the article on purpose: addBoth calls its callback with the
# result as a positional argument, so the zero-argument done() raises
# TypeError inside the Deferred and reactor.stop() is never reached.
# Register done1 instead to make the program exit.
dd.addBoth(done)
reactor.run()
上面的代码是无法停止的,如果使用的是
dd.addBoth(done)
done函数的定义是没有参数的。
而使用另一个带参数的done1(*args,**kwargs),即 dd.addBoth(done1),
是可以正常退出的。原因是addBoth会把结果作为参数传给回调函数,无参数的done被调用时会抛出TypeError,里面的reactor.stop()就不会被执行。
原创文章
转载请注明出处:
http://30daydo.com/article/509
cv2 distanceTransform函数的用法 python
李魔佛 发表了文章 • 0 个评论 • 11605 次浏览 • 2019-07-08 15:35
Calculates the distance to the closest zero pixel for each pixel of the source image.
Python: cv2.distanceTransform(src, distanceType, maskSize[, dst]) → dst
Python: cv.DistTransform(src, dst, distance_type=CV_DIST_L2, mask_size=3, mask=None, labels=None) → None
Parameters:
src – 8-bit, single-channel (binary) source image.
dst – Output image with calculated distances. It is a 32-bit floating-point, single-channel image of the same size as src .
distanceType – Type of distance. It can be CV_DIST_L1, CV_DIST_L2 , or CV_DIST_C .
maskSize – Size of the distance transform mask. It can be 3, 5, or CV_DIST_MASK_PRECISE (the latter option is only supported by the first function). In case of the CV_DIST_L1 or CV_DIST_C distance type, the parameter is forced to 3 because a 3×3 mask gives the same result as a 5×5 or any larger aperture.
labels – Optional output 2D array of labels (the discrete Voronoi diagram). It has the type CV_32SC1 and the same size as src . See the details below.
labelType – Type of the label array to build. If labelType==DIST_LABEL_CCOMP then each connected component of zeros in src (as well as all the non-zero pixels closest to the connected component) will be assigned the same label. If labelType==DIST_LABEL_PIXEL then each zero pixel (and all the non-zero pixels closest to it) gets its own label.
The functions distanceTransform calculate the approximate or precise distance from every binary image pixel to the nearest zero pixel. For zero image pixels, the distance will obviously be zero.
When maskSize == CV_DIST_MASK_PRECISE and distanceType == CV_DIST_L2 , the function runs the algorithm described in [Felzenszwalb04]. This algorithm is parallelized with the TBB library.
In other cases, the algorithm [Borgefors86] is used. This means that for a pixel the function finds the shortest path to the nearest zero pixel consisting of basic shifts: horizontal, vertical, diagonal, or knight’s move (the latest is available for a 5×5 mask). The overall distance is calculated as a sum of these basic distances. Since the distance function should be symmetric, all of the horizontal and vertical shifts must have the same cost (denoted as a), all the diagonal shifts must have the same cost (denoted as b), and all knight’s moves must have the same cost (denoted as c). For the CV_DIST_C and CV_DIST_L1 types, the distance is calculated precisely, whereas for CV_DIST_L2 (Euclidean distance) the distance can be calculated only with a relative error (a 5×5 mask gives more accurate results). For a, b, and c, OpenCV uses the values suggested in the original paper:
CV_DIST_C (3×3) a = 1, b = 1
CV_DIST_L1 (3×3) a = 1, b = 2
CV_DIST_L2 (3×3) a = 0.955, b = 1.3693
CV_DIST_L2 (5×5) a = 1, b = 1.4, c = 2.1969
Typically, for a fast, coarse distance estimation CV_DIST_L2, a 3×3 mask is used. For a more accurate distance estimation CV_DIST_L2, a 5×5 mask or the precise algorithm is used. Note that both the precise and the approximate algorithms are linear on the number of pixels.
The second variant of the function does not only compute the minimum distance for each pixel (x, y) but also identifies the nearest connected component consisting of zero pixels (labelType==DIST_LABEL_CCOMP) or the nearest zero pixel (labelType==DIST_LABEL_PIXEL). Index of the component/pixel is stored in labels(x, y). When labelType==DIST_LABEL_CCOMP, the function automatically finds connected components of zero pixels in the input image and marks them with distinct labels. When labelType==DIST_LABEL_PIXEL, the function scans through the input image and marks all the zero pixels with distinct labels.
In this mode, the complexity is still linear. That is, the function provides a very fast way to compute the Voronoi diagram for a binary image. Currently, the second variant can use only the approximate distance transform algorithm, i.e. maskSize=CV_DIST_MASK_PRECISE is not supported yet.
Note
An example on using the distance transform can be found at opencv_source_code/samples/cpp/distrans.cpp
(Python) An example on using the distance transform can be found at opencv_source/samples/python2/distrans.py
查看全部
distanceTransform
Calculates the distance to the closest zero pixel for each pixel of the source image.
Python: cv2.distanceTransform(src, distanceType, maskSize[, dst]) → dst
Python: cv.DistTransform(src, dst, distance_type=CV_DIST_L2, mask_size=3, mask=None, labels=None) → None
Parameters:
src – 8-bit, single-channel (binary) source image.
dst – Output image with calculated distances. It is a 32-bit floating-point, single-channel image of the same size as src .
distanceType – Type of distance. It can be CV_DIST_L1, CV_DIST_L2 , or CV_DIST_C .
maskSize – Size of the distance transform mask. It can be 3, 5, or CV_DIST_MASK_PRECISE (the latter option is only supported by the first function). In case of the CV_DIST_L1 or CV_DIST_C distance type, the parameter is forced to 3 because a 3×3 mask gives the same result as a 5×5 or any larger aperture.
labels – Optional output 2D array of labels (the discrete Voronoi diagram). It has the type CV_32SC1 and the same size as src . See the details below.
labelType – Type of the label array to build. If labelType==DIST_LABEL_CCOMP then each connected component of zeros in src (as well as all the non-zero pixels closest to the connected component) will be assigned the same label. If labelType==DIST_LABEL_PIXEL then each zero pixel (and all the non-zero pixels closest to it) gets its own label.
The functions distanceTransform calculate the approximate or precise distance from every binary image pixel to the nearest zero pixel. For zero image pixels, the distance will obviously be zero.
When maskSize == CV_DIST_MASK_PRECISE and distanceType == CV_DIST_L2 , the function runs the algorithm described in [Felzenszwalb04]. This algorithm is parallelized with the TBB library.
In other cases, the algorithm [Borgefors86] is used. This means that for a pixel the function finds the shortest path to the nearest zero pixel consisting of basic shifts: horizontal, vertical, diagonal, or knight’s move (the latest is available for a 5×5 mask). The overall distance is calculated as a sum of these basic distances. Since the distance function should be symmetric, all of the horizontal and vertical shifts must have the same cost (denoted as a), all the diagonal shifts must have the same cost (denoted as b), and all knight’s moves must have the same cost (denoted as c). For the CV_DIST_C and CV_DIST_L1 types, the distance is calculated precisely, whereas for CV_DIST_L2 (Euclidean distance) the distance can be calculated only with a relative error (a 5×5 mask gives more accurate results). For a, b, and c, OpenCV uses the values suggested in the original paper:
CV_DIST_C (3×3) a = 1, b = 1
CV_DIST_L1 (3×3) a = 1, b = 2
CV_DIST_L2 (3×3) a = 0.955, b = 1.3693
CV_DIST_L2 (5×5) a = 1, b = 1.4, c = 2.1969
Typically, for a fast, coarse distance estimation CV_DIST_L2, a 3×3 mask is used. For a more accurate distance estimation CV_DIST_L2, a 5×5 mask or the precise algorithm is used. Note that both the precise and the approximate algorithms are linear on the number of pixels.
The second variant of the function does not only compute the minimum distance for each pixel (x, y) but also identifies the nearest connected component consisting of zero pixels (labelType==DIST_LABEL_CCOMP) or the nearest zero pixel (labelType==DIST_LABEL_PIXEL). Index of the component/pixel is stored in labels(x, y). When labelType==DIST_LABEL_CCOMP, the function automatically finds connected components of zero pixels in the input image and marks them with distinct labels. When labelType==DIST_LABEL_PIXEL, the function scans through the input image and marks all the zero pixels with distinct labels.
In this mode, the complexity is still linear. That is, the function provides a very fast way to compute the Voronoi diagram for a binary image. Currently, the second variant can use only the approximate distance transform algorithm, i.e. maskSize=CV_DIST_MASK_PRECISE is not supported yet.
Note
An example on using the distance transform can be found at opencv_source_code/samples/cpp/distrans.cpp
(Python) An example on using the distance transform can be found at opencv_source/samples/python2/distrans.py