python

asyncio 异步爬取vs requests同步爬取性能对比

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 2575 次浏览 • 2020-11-25 11:21 • 来自相关话题

首先是异步爬取：
import sys
sys.path.append('..')
import asyncio
import datetime
import aiohttp
import re
import time
from parsel import Selector
from configure.settings import DBSelector
from common.BaseService import BaseService

SLEEP = 2

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'}

URL_MAP = {'home_page': 'https://holdle.com/stocks/industry', 'base': 'https://holdle.com'}

class AsyncMongo():
def __init__(self):
self.DB = DBSelector()
self.client = self.DB.mongo(location_type='qq', async_type=True)
self.db = self.client['db_stock']

async def update(self, table,data):
self.doc= self.db[table]
await self.doc.insert_many(data)

class Holdle(BaseService):

def __init__(self):
super(Holdle, self).__init__()
self.data_processor = AsyncMongo()
self.tables_list =['ROE','Cash_Ratio','Gross_Margin','Operation_Margin','Net_Profit_Ratio','Dividend_ratio']

async def home_page(self):
start = time.time()
async with aiohttp.ClientSession() as session:
async with session.get(url=URL_MAP['home_page'], headers=headers) as response:
html = await response.text() # 这个阻塞
resp = Selector(text=html)
industries = resp.xpath('//ul[@class="list-unstyled"]/a')
task_list = []
for industry in industries:
json_data = {}
industry_url = industry.xpath('.//@href').extract_first()
industry_name = industry.xpath('.//li/text()').extract_first()
industry_name = industry_name.replace('-', '').strip()
json_data['industry_url'] = industry_url
json_data['industry_name'] = industry_name

task = asyncio.ensure_future(self.detail_list(session, industry_url, json_data))
task_list.append(task)

await asyncio.gather(*task_list)
end = time.time()

print(f'time used {end - start}')

async def detail_list(self, session, url, json_data):

async with session.get(URL_MAP['base'] + url, headers=headers) as response:
response = await response.text()
await self.parse_detail(response, json_data)

async def parse_detail(self, html, json_data=None):
resp = Selector(text=html)
industry=json_data['industry_name']
tables = resp.xpath('//table[@class="table table-bordered"]')
if len(tables)!=6:
raise ValueError

for index,table in enumerate(self.tables_list):
rows = tables[index].xpath('.//tr')
result = []
for row in rows[1:]:
stock_name = row.xpath('.//td[1]/text()').extract_first()
value = row.xpath('.//td[2]/text()').extract_first()
value = float(value)
d={'industry':industry,'name':stock_name,'value':value,'crawltime':datetime.datetime.now()}
result.append(d)
await self.data_processor.update(table,result)

app = Holdle()
loop = asyncio.get_event_loop()
loop.run_until_complete(app.home_page())
爬完并且入库，用时大约为35s

使用requests爬取
# -*- coding: utf-8 -*-
# @Time : 2020/11/24 21:42
# @File : sync_spider.py
# @Author : Rocky C@www.30daydo.com
import requests
import sys
sys.path.append('..')
import asyncio
import datetime
import aiohttp
import re
import time
from parsel import Selector
from configure.settings import DBSelector
from common.BaseService import BaseService

SLEEP = 2

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'}

URL_MAP = {'home_page': 'https://holdle.com/stocks/industry', 'base': 'https://holdle.com'}

class Holdle(BaseService):

def __init__(self):
super(Holdle, self).__init__()

self.DB = DBSelector()
self.client = self.DB.mongo(location_type='qq', async_type=True)
self.session = requests.Session()

def run(self):
start = time.time()

response = self.session.get(url=URL_MAP['home_page'], headers=headers)
html = response.text # 这个阻塞
resp = Selector(text=html)
industries = resp.xpath('//ul[@class="list-unstyled"]/a')
for industry in industries:
json_data = {}
industry_url = industry.xpath('.//@href').extract_first()
industry_name = industry.xpath('.//li/text()').extract_first()
json_data['industry_url'] = industry_url
json_data['industry_name'] = industry_name
self.detail_list(industry_url, json_data)

end = time.time()
print(f'time used {end-start}')

def detail_list(self, url, json_data):

response = self.session.get(URL_MAP['base']+url, headers=headers)
response =response.text
self.parse_detail(response, json_data)

def parse_detail(self, html, json_data=None):
resp = Selector(text=html)
title =resp.xpath('//title/text()').extract_first()
print(title)

app = Holdle()
app.run()
用时约160s，而且这里还省略了mongo入库的时间。上面异步爬取里面包含了异步存入mongo。

所以单从网络IO性能上来说，异步是比纯同步要快很多。
但是，async的生态做得不是太好，第三方的异步框架做得也不够完善。

因为如果系统中引入了异步，很多耗时的地方也是需要使用异步的写法和框架，不然会导致系统的控制权没有被正确转移。

水文一篇。
完毕
查看全部

首先是异步爬取：

import sys

sys.path.append('..')

import asyncio

import datetime

import aiohttp

import re

import time

from parsel import Selector

from configure.settings import DBSelector

from common.BaseService import BaseService



SLEEP = 2



headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',

           'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'}



URL_MAP = {'home_page': 'https://holdle.com/stocks/industry', 'base': 'https://holdle.com'}





class AsyncMongo():
    """Small async helper that writes documents into the ``db_stock`` database.

    The client is obtained from ``DBSelector().mongo(..., async_type=True)``;
    collections are looked up by name on every call to :meth:`update`.
    """

    def __init__(self):
        self.DB = DBSelector()
        self.client = self.DB.mongo(location_type='qq', async_type=True)
        self.db = self.client['db_stock']

    async def update(self, table, data):
        """Insert the documents in ``data`` into the collection named ``table``.

        :param table: name of the target collection inside ``db_stock``.
        :param data: list of documents (dicts) to insert.
        :return: None. Nothing is written when ``data`` is empty.
        """
        # ``doc`` stays an instance attribute because the original code set it.
        self.doc = self.db[table]

        # insert_many() refuses an empty document list, so a table that
        # produced no rows must be skipped instead of crashing the crawl.
        if not data:
            return

        await self.doc.insert_many(data)





class Holdle(BaseService):
    """Asynchronous crawler for the holdle.com industry ranking pages.

    ``home_page`` downloads the industry index and fetches every industry
    page concurrently; ``parse_detail`` turns the ranking tables of one page
    into documents and stores them through ``AsyncMongo``.
    """

    def __init__(self):
        super(Holdle, self).__init__()
        self.data_processor = AsyncMongo()
        # The i-th name labels the i-th ranking table found on a detail page
        # and is also used as the MongoDB collection name.
        self.tables_list = ['ROE', 'Cash_Ratio', 'Gross_Margin', 'Operation_Margin',
                            'Net_Profit_Ratio', 'Dividend_ratio']

    async def home_page(self):
        """Crawl the industry index and every industry page linked from it.

        Prints one line per industry page that failed and finally the total
        elapsed time. A failing page no longer aborts the other downloads.
        """
        start = time.time()
        async with aiohttp.ClientSession() as session:
            async with session.get(url=URL_MAP['home_page'], headers=headers) as response:
                html = await response.text()
            # The index response is released here; it is not needed while the
            # detail pages are being downloaded.

            resp = Selector(text=html)
            industries = resp.xpath('//ul[@class="list-unstyled"]/a')
            task_list = []
            submitted = []
            for industry in industries:
                industry_url = industry.xpath('.//@href').extract_first()
                industry_name = industry.xpath('.//li/text()').extract_first()
                # Anchors without a link or a label cannot be crawled/labelled.
                if not industry_url or industry_name is None:
                    continue
                industry_name = industry_name.replace('-', '').strip()
                json_data = {'industry_url': industry_url,
                             'industry_name': industry_name}

                task = asyncio.ensure_future(self.detail_list(session, industry_url, json_data))
                task_list.append(task)
                submitted.append(json_data)

            # return_exceptions=True lets every task run to completion while
            # the session is still open, instead of the first error closing
            # the session underneath the remaining requests.
            outcomes = await asyncio.gather(*task_list, return_exceptions=True)

        end = time.time()

        for json_data, outcome in zip(submitted, outcomes):
            if isinstance(outcome, BaseException):
                print(f"{json_data['industry_name']} failed: {outcome!r}")

        print(f'time used {end - start}')

    async def detail_list(self, session, url, json_data):
        """Download one industry page and pass its HTML to ``parse_detail``.

        :param session: the shared ``aiohttp.ClientSession``.
        :param url: site-relative path of the industry page.
        :param json_data: dict carrying at least ``industry_name``.
        """
        async with session.get(URL_MAP['base'] + url, headers=headers) as response:
            html = await response.text()
        await self.parse_detail(html, json_data)

    async def parse_detail(self, html, json_data=None):
        """Extract the ranking tables from ``html`` and store their rows.

        :param html: HTML text of one industry page.
        :param json_data: dict with an ``industry_name`` key (required despite
            the ``None`` default, which is kept for signature compatibility).
        :raises ValueError: if the page does not contain exactly one table per
            entry of ``self.tables_list``.
        """
        resp = Selector(text=html)
        industry = json_data['industry_name']
        tables = resp.xpath('//table[@class="table table-bordered"]')
        expected = len(self.tables_list)
        if len(tables) != expected:
            raise ValueError(
                f'{industry}: expected {expected} tables, found {len(tables)}')

        for table_name, table in zip(self.tables_list, tables):
            result = []
            # rows[0] is skipped, as in the original (header row).
            for row in table.xpath('.//tr')[1:]:
                stock_name = row.xpath('.//td[1]/text()').extract_first()
                raw_value = row.xpath('.//td[2]/text()').extract_first()
                try:
                    value = float(raw_value)
                except (TypeError, ValueError):
                    # Missing or non-numeric cell: drop this row rather than
                    # losing the whole page.
                    continue
                result.append({'industry': industry,
                               'name': stock_name,
                               'value': value,
                               'crawltime': datetime.datetime.now()})
            # insert_many() rejects an empty list, so skip empty tables.
            if result:
                await self.data_processor.update(table_name, result)





app = Holdle()

loop = asyncio.get_event_loop()

loop.run_until_complete(app.home_page())

爬完并且入库，用时大约为35s

使用requests爬取

# -*- coding: utf-8 -*-

# @Time : 2020/11/24 21:42

# @File : sync_spider.py

# @Author : Rocky C@www.30daydo.com

import requests

import sys

sys.path.append('..')

import asyncio

import datetime

import aiohttp

import re

import time

from parsel import Selector

from configure.settings import DBSelector

from common.BaseService import BaseService



SLEEP = 2



headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',

           'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'}



URL_MAP = {'home_page': 'https://holdle.com/stocks/industry', 'base': 'https://holdle.com'}





class Holdle(BaseService):
    """Synchronous (requests-based) crawler used as the baseline for timing.

    It walks the same industry index as the asynchronous version, downloads
    each industry page one after another and prints the page title.
    """

    # Seconds to wait for any single HTTP request; without a timeout a stalled
    # connection would block the whole sequential crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        super(Holdle, self).__init__()

        self.DB = DBSelector()
        # NOTE(review): no method of this class uses ``self.client``; kept
        # because other code may rely on the attribute.
        self.client = self.DB.mongo(location_type='qq', async_type=True)
        self.session = requests.Session()

    def run(self):
        """Crawl the index and every industry page, then print elapsed time."""
        start = time.time()

        response = self.session.get(url=URL_MAP['home_page'], headers=headers,
                                    timeout=self.REQUEST_TIMEOUT)
        html = response.text  # blocking call: waits for the full body
        resp = Selector(text=html)
        industries = resp.xpath('//ul[@class="list-unstyled"]/a')
        for industry in industries:
            industry_url = industry.xpath('.//@href').extract_first()
            industry_name = industry.xpath('.//li/text()').extract_first()
            # An anchor without href cannot be joined onto the base URL.
            if not industry_url:
                continue
            json_data = {'industry_url': industry_url,
                         'industry_name': industry_name}
            self.detail_list(industry_url, json_data)

        end = time.time()
        print(f'time used {end-start}')

    def detail_list(self, url, json_data):
        """Download one industry page and hand its HTML to ``parse_detail``.

        :param url: site-relative path of the industry page.
        :param json_data: dict describing the industry (passed through).
        """
        response = self.session.get(URL_MAP['base'] + url, headers=headers,
                                    timeout=self.REQUEST_TIMEOUT)
        self.parse_detail(response.text, json_data)

    def parse_detail(self, html, json_data=None):
        """Print the ``<title>`` text of ``html``; ``json_data`` is unused."""
        resp = Selector(text=html)
        title = resp.xpath('//title/text()').extract_first()
        print(title)





app = Holdle()

app.run()

用时约160s，而且这里还省略了mongo入库的时间。上面异步爬取里面包含了异步存入mongo。

所以单从网络IO性能上来说，异步是比纯同步要快很多。
但是，async的生态做得不是太好，第三方的异步框架做得也不够完善。

因为如果系统中引入了异步，很多耗时的地方也是需要使用异步的写法和框架，不然会导致系统的控制权没有被正确转移。

水文一篇。
完毕

夜深了，你们还在吗？

chenchen 发表了文章 • 0 个评论 • 1783 次浏览 • 2020-11-20 22:34 • 来自相关话题

夜深了，你们还在吗？？？？？？？？？？？？？？？？？？？

大家好啊，日常报道，关照关照

chenchen 发表了文章 • 0 个评论 • 1638 次浏览 • 2020-11-20 17:11 • 来自相关话题

大家好啊，日常报道，关照关照。。。。。。。。。。。。

异步asyncio加锁的正确用法

李魔佛发表了文章 • 0 个评论 • 4619 次浏览 • 2020-11-15 10:19 • 来自相关话题

对于全局变量count进行统计加锁
import aiohttp
import asyncio
import execjs
import threading
global pages
global count

headers = {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "en-US,en;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Host": "dcfm.eastmoney.com",
"Pragma": "no-cache",
"Referer": "http://data.eastmoney.com/xg/xg/default.html",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/69.0.3497.81 Chrome/69.0.3497.81 Safari/537.36",
}

home_url = 'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?type=XGSG_LB&token=70f12f2f4f091e459a279469fe49eca5&st=purchasedate,securitycode&sr=-1&p={}&ps=50&js=var%20hsEnHLwG={{pages:(tp),data:(x)}}&rt=53512217'

loop = asyncio.get_event_loop()
# lock = threading.Lock()
lock = asyncio.Lock()
def parse_json(content):
content += ';function getV(){return hsEnHLwG;}'
ctx = execjs.compile(content)
result = ctx.call('getV')
return result

async def fetch(session,page):
global pages
global count
async with session.get(home_url.format(page),headers=headers) as resp:
# print(f'here：： {page}')
content = await resp.text()

try:
js_content = parse_json(content)
for stock_info in js_content['data']:
securityshortname = stock_info['securityshortname']
# print(securityshortname)
except Exception as e:
print(e)

async with lock:
count=count+1

print(f'count:{count}')
if count == pages:
print('End of loop')
loop.stop()

async def main():
global pages
global count
count=0
async with aiohttp.ClientSession() as session:
async with session.get(home_url.format(1), headers=headers) as resp:

content = await resp.text()
js_data = parse_json(content)
pages = js_data['pages']
print(f'pages: {pages}')
for page in range(1,pages+1):
task = asyncio.ensure_future(fetch(session,page))

await asyncio.sleep(1)

asyncio.ensure_future(main())
loop.run_forever()
1. 如果不加入锁，每次运行的结果可能不一样。
2. 不能用多线程的threading 锁，得到的每次运行结果也有可能不一样
3. 用asyncio的锁要加关键字 async
查看全部

对于全局变量count进行统计加锁

import aiohttp

import asyncio

import execjs

import threading

global pages

global count



headers = {

    "Accept": "*/*",

    "Accept-Encoding": "gzip, deflate",

    "Accept-Language": "en-US,en;q=0.9",

    "Cache-Control": "no-cache",

    "Connection": "keep-alive",

    "Host": "dcfm.eastmoney.com",

    "Pragma": "no-cache",

    "Referer": "http://data.eastmoney.com/xg/xg/default.html",

    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/69.0.3497.81 Chrome/69.0.3497.81 Safari/537.36",

}



home_url = 'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?type=XGSG_LB&token=70f12f2f4f091e459a279469fe49eca5&st=purchasedate,securitycode&sr=-1&p={}&ps=50&js=var%20hsEnHLwG={{pages:(tp),data:(x)}}&rt=53512217'



loop = asyncio.get_event_loop()

# lock = threading.Lock()

lock = asyncio.Lock()

def parse_json(content):
    """Evaluate the JavaScript in ``content`` and return ``hsEnHLwG``.

    A ``getV`` accessor is appended to the script, the script is compiled
    with ``execjs`` and ``getV`` is called to read the variable back.

    NOTE(review): ``content`` is the raw HTTP response body, so this executes
    remote JavaScript in the local JS runtime.
    """
    script = content + ';function getV(){return hsEnHLwG;}'
    return execjs.compile(script).call('getV')





async def fetch(session, page):
    """Download and parse one result page, then bump the shared ``count``.

    Network and parsing errors are printed, not raised, so that every page is
    always counted and the crawl cannot stall on a single bad request.

    :param session: the shared ``aiohttp.ClientSession``.
    :param page: 1-based page number substituted into ``home_url``.
    """
    global count
    try:
        async with session.get(home_url.format(page), headers=headers) as resp:
            # print(f'here：： {page}')
            content = await resp.text()

        js_content = parse_json(content)
        for stock_info in js_content['data']:
            # Placeholder for real processing; the lookup also checks that
            # every record carries the expected key.
            securityshortname = stock_info['securityshortname']
            # print(securityshortname)
    except Exception as e:
        print(e)

    # asyncio.Lock must be entered with ``async with``.
    async with lock:
        count = count + 1
        print(f'count:{count}')


async def main():
    """Read the page count from page 1, then fetch all pages concurrently.

    All ``fetch`` tasks are awaited inside the session context, so the
    session is only closed once every request has finished.
    """
    global pages
    global count
    count = 0
    async with aiohttp.ClientSession() as session:
        async with session.get(home_url.format(1), headers=headers) as resp:
            content = await resp.text()

        js_data = parse_json(content)
        pages = js_data['pages']
        print(f'pages: {pages}')

        await asyncio.gather(
            *(fetch(session, page) for page in range(1, pages + 1)))

    print('End of loop')


# Run on the module-level ``loop`` (the one ``lock`` was created for) and
# return as soon as main() finishes, instead of run_forever() + loop.stop().
loop.run_until_complete(main())

1. 如果不加入锁，每次运行的结果可能不一样。
2. 不能用多线程的threading 锁，得到的每次运行结果也有可能不一样
3. 用asyncio的锁要加关键字 async

attrs() got an unexpected keyword argument 'eq'

李魔佛发表了文章 • 0 个评论 • 2675 次浏览 • 2020-11-12 22:42 • 来自相关话题

xda@xda-dt:~$ pip install attrs --upgrade
Collecting attrs
Downloading https://files.pythonhosted.org ... y.whl (49kB)
100% |████████████████████████████████| 51kB 79kB/s
Installing collected packages: attrs
Found existing installation: attrs 18.2.0
Uninstalling attrs-18.2.0:
Successfully uninstalled attrs-18.2.0
Successfully installed attrs-20.3.0 查看全部

xda@xda-dt:~$ pip install attrs --upgrade
Collecting attrs
Downloading https://files.pythonhosted.org ... y.whl (49kB)
100% |████████████████████████████████| 51kB 79kB/s
Installing collected packages: attrs
Found existing installation: attrs 18.2.0
Uninstalling attrs-18.2.0:
Successfully uninstalled attrs-18.2.0
Successfully installed attrs-20.3.0

pyecharts绘图保存为图片适用于ssh无头浏览器运行

李魔佛发表了文章 • 0 个评论 • 2450 次浏览 • 2020-11-04 22:27 • 来自相关话题

网上搜索到的答案是使用chrome driver实现的，但是本人的程序是运行在centos下的，centos下折腾chrome driver比较蛋疼，所以看了下pyecharts.render的源码，其实这个也支持使用无头phantomjs进行截图的，当然这个不是一般的直接截取屏幕，是通过JS代码把html里面的渲染图像下载下来，清晰度比普通截图要高很多很多。

make_snapshot(snapshot, bar.render(), f"data/{today}_cb.png", driver=driver)

在最后一行传入一个driver就可以了，这个driver使用phantomjs的实例。

import os
from pyecharts.render import make_snapshot
from snapshot_selenium import snapshot
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar
import sys
from selenium import webdriver
from pyecharts.commons.utils import JsCode

if sys.platform == 'win32':
SELENIUM_PATH = r'C:\OneDrive\Tool\phantomjs-2.1.1-windows\phantomjs-2.1.1-windows\bin\phantomjs.exe'
driver = None
else:
SELENIUM_PATH = './phantomjs'
driver = webdriver.PhantomJS(executable_path=SELENIUM_PATH)

bar = (
Bar()
.add_xaxis(list(result_dict .keys()))
.add_yaxis(f"{today}-可转债价格分布", y_list, category_gap=3)
.add_yaxis(f"{today}-正股价格分布", y_zg_list, category_gap=3)
.set_series_opts(
label_opts=opts.LabelOpts(is_show=True),
axispointer_opts=opts.AxisPointerOpts(is_show=True))
.set_global_opts(
title_opts=opts.TitleOpts(title="可转债价格分布"),
xaxis_opts=opts.AxisOpts(
name="涨跌幅",
is_show=True,
name_rotate=30,
),
graphic_opts=[
opts.GraphicGroup(
graphic_item=opts.GraphicItem(
left="70%",
top="20%",
),
children=[
opts.GraphicText(
graphic_item=opts.GraphicItem(
left="center",
top="middle",
z=100,
),
graphic_textstyle_opts=opts.GraphicTextStyleOpts(
text=JsCode(
f"['涨幅>=0：{bigger}',"
f"'涨幅<0：{smaller}',"
f"'平均涨幅：{avg}%',"
f"'波动方差：{std}',"
f"'',"
f"'最大：{max_name} {max_pct}%',"
f"'最小：{min_name} {min_pct}%',"
"''].join('\\n')"
),
font="14px Microsoft YaHei",
graphic_basicstyle_opts=opts.GraphicBasicStyleOpts(
fill="#333"
)
)
)
]
)
],
)

)

bar.render(os.path.join('data', f"{today}_cb.html"))
make_snapshot(snapshot, bar.render(), f"data/{today}_cb.png", driver=driver)

查看全部

网上搜索到的答案是使用chrome driver实现的，但是本人的程序是运行在centos下的，centos下折腾chrome driver比较蛋疼，所以看了下pyecharts.render的源码，其实这个也支持使用无头phantomjs进行截图的，当然这个不是一般的直接截取屏幕，是通过JS代码把html里面的渲染图像下载下来，清晰度比普通截图要高很多很多。

make_snapshot(snapshot, bar.render(), f"data/{today}_cb.png", driver=driver)

在最后一行传入一个driver就可以了，这个driver使用phantomjs的实例。

import os

from pyecharts.render import make_snapshot

from snapshot_selenium import snapshot

import pandas as pd

from pyecharts import options as opts

from pyecharts.charts import Bar

import sys

from selenium import webdriver

from pyecharts.commons.utils import JsCode



# Pick the PhantomJS binary per platform. On Windows no driver object is
# created and ``driver`` stays None.
# NOTE(review): with driver=None snapshot_selenium is expected to start its
# own default browser — confirm against the installed version.
if sys.platform == 'win32':
    SELENIUM_PATH = r'C:\OneDrive\Tool\phantomjs-2.1.1-windows\phantomjs-2.1.1-windows\bin\phantomjs.exe'
    driver = None
else:
    SELENIUM_PATH = './phantomjs'
    driver = webdriver.PhantomJS(executable_path=SELENIUM_PATH)

# The chart is built and rendered on every platform; previously these
# statements sat inside the ``else`` branch and never ran on Windows.
# ``result_dict``, ``today``, ``y_list``, ``y_zg_list`` and the statistics
# used in the text box are expected to be defined earlier in the script.
bar = (
    Bar()
    .add_xaxis(list(result_dict.keys()))
    .add_yaxis(f"{today}-可转债价格分布", y_list, category_gap=3)
    .add_yaxis(f"{today}-正股价格分布", y_zg_list, category_gap=3)
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=True),
        axispointer_opts=opts.AxisPointerOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="可转债价格分布"),
        xaxis_opts=opts.AxisOpts(
            name="涨跌幅",
            is_show=True,
            name_rotate=30,
        ),
        graphic_opts=[
            opts.GraphicGroup(
                graphic_item=opts.GraphicItem(
                    left="70%",
                    top="20%",
                ),
                children=[
                    opts.GraphicText(
                        graphic_item=opts.GraphicItem(
                            left="center",
                            top="middle",
                            z=100,
                        ),
                        graphic_textstyle_opts=opts.GraphicTextStyleOpts(
                            # JsCode: the text is a JS expression joining
                            # the summary lines with newlines.
                            text=JsCode(
                                f"['涨幅>=0：{bigger}',"
                                f"'涨幅<0：{smaller}',"
                                f"'平均涨幅：{avg}%',"
                                f"'波动方差：{std}',"
                                f"'',"
                                f"'最大：{max_name}  {max_pct}%',"
                                f"'最小：{min_name}  {min_pct}%',"
                                "''].join('\\n')"
                            ),
                            font="14px Microsoft YaHei",
                            graphic_basicstyle_opts=opts.GraphicBasicStyleOpts(
                                fill="#333"
                            ),
                        ),
                    )
                ],
            )
        ],
    )
)

# Render once and snapshot that same HTML file; a second bare ``bar.render()``
# would write an extra ``render.html`` into the working directory.
html_path = bar.render(os.path.join('data', f"{today}_cb.html"))
make_snapshot(snapshot, html_path, f"data/{today}_cb.png", driver=driver)

使用sshtunnel SSHTunnelForwarder 作为跳板连接mysql后一直卡住不退出

李魔佛发表了文章 • 0 个评论 • 5323 次浏览 • 2020-11-04 10:10 • 来自相关话题

代码如下：server = SSHTunnelForwarder(
ssh_address_or_host=host,
ssh_port=port,
ssh_username=user,
ssh_password=password,
local_bind_address=('127.0.0.1', local_port),
remote_bind_address=(host, mysql_port)
)

server.start()
conn = pymysql.connect(
host='127.0.0.1',
port=local_port,
user=user,
password=password,
db='db_stock'
)

cursor = conn.cursor()
cursor.execute('select count(*) from tb_cb_index')
ret = cursor.fetchall()
print(ret)
server.stop()
print('stop')

代码运行后并没有结束，也没有打印stop的字符。在程序里已经使用了server.stop（）关闭ssh的连接。

后面发现日志里面，mysql的连接没有断开，导致server没有被关闭，所以在上面的代码中加一句：
print(ret)
conn.close()
server.stop()
print('stop')
把mysql的连接关闭，然后就可以把ssh的连接关闭，然后打印stop字符了。
查看全部

代码如下：

server = SSHTunnelForwarder(

    ssh_address_or_host=host,

    ssh_port=port,

    ssh_username=user,

    ssh_password=password,

    local_bind_address=('127.0.0.1', local_port),

    remote_bind_address=(host, mysql_port)

)



server.start()

conn = pymysql.connect(

    host='127.0.0.1',

    port=local_port,

    user=user,

    password=password,

    db='db_stock'

)



# Run a single query through the tunnelled MySQL connection.
cursor = conn.cursor()

cursor.execute('select count(*) from tb_cb_index')

ret = cursor.fetchall()

print(ret)

# NOTE: ``conn`` is still open at this point. The surrounding text reports
# that this version hangs here and that 'stop' is never printed.
server.stop()

print('stop')

代码运行后并没有结束，也没有打印stop的字符。在程序里已经使用了server.stop（）关闭ssh的连接。

后面发现日志里面，mysql的连接没有断开，导致server没有被关闭，所以在上面的代码中加一句：

print(ret)

conn.close()

server.stop()

print('stop')

把mysql的连接关闭，然后就可以把ssh的连接关闭，然后打印stop字符了。

什么是http代理ip

python爬虫 • wanbainip 发起了问题 • 1 人关注 • 0 个回复 • 2564 次浏览 • 2020-11-03 18:17 • 来自相关话题

pyecharts感觉这个库的设计有点业余，太过于反python了

李魔佛发表了文章 • 0 个评论 • 2033 次浏览 • 2020-11-03 01:09 • 来自相关话题

难道不知道python的数据分析用的是numpy的数据类型么？

一些扩展的类型int64作为数据导入到绘图函数中，居然不兼容，而且也不报错。
经过逐个参数的排查，才发现是数据类型的问题。数据是直接在pandas里面抽取出来的，然后填充到绘图汇总，然后就显示一片空白，还一直以为是我哪个参数用错了。

也许是从前端设计者手里接过来的项目吧。按葫芦画瓢这样操作。

一个函数里面居然可以放置几十个，上百个参数，我滴神呀，而且参数里面还有其他类型的参数，尼玛！

当然最让人头疼是那往上不兼容的做法，现在的新版本完全已经修改了，不兼容旧版本，之前的写代码又是一通修改。查看全部

难道不知道python的数据分析用的是numpy的数据类型么？

一些扩展的类型int64作为数据导入到绘图函数中，居然不兼容，而且也不报错。
经过逐个参数的排查，才发现是数据类型的问题。数据是直接从pandas里面抽取出来的，然后填充到绘图函数中，结果就显示一片空白，还一直以为是我哪个参数用错了。

也许是从前端设计者手里接过来的项目吧，依葫芦画瓢这样操作。

一个函数里面居然可以放置几十个，上百个参数，我滴神呀，而且参数里面还有其他类型的参数，尼玛！

当然最让人头疼的是那向下不兼容的做法，现在的新版本已经完全修改了，不兼容旧版本，之前写的代码又要一通修改。

免费代理ip与收费的代理ip

python爬虫 • wanbainip 发表了文章 • 0 个评论 • 2639 次浏览 • 2020-10-30 18:00 • 来自相关话题

在爬虫采集数据过程中，经常会遇到ip被限制，这也是常见的问题。因为网站都会根据ip访问的频率进行判断封锁ip,这是反爬虫机制的策略，如果拥有大量的ip资源，自然会突破ip限制的反爬虫策略。

曾经有尝试过使用免费的代理ip来搭建代理池，可是免费的代理ip不仅资源少，而且可用率、高匿性、速度等都极差，每次使用都需要借助第三方软件进行检查是否可用，严重影响效率，根本满足不了任务的需求。

收费的代理ip与免费的代理ip差距非常大，不仅拥有海量的ip资源，可用率、高匿性、速度都是极好。操作简单工作效率既然提高上去了。经过多家的测试，最终选择了性价比最高的万变ip。高质量的优质代理ip才可以真正用来防止爬虫被封锁，如果使用普通代理，爬虫的真实IP还是会暴露。新获取一批新IP 查看全部

在爬虫采集数据过程中，经常会遇到ip被限制，这也是常见的问题。因为网站都会根据ip访问的频率进行判断封锁ip,这是反爬虫机制的策略，如果拥有大量的ip资源，自然会突破ip限制的反爬虫策略。

曾经有尝试过使用免费的代理ip来搭建代理池，可是免费的代理ip不仅资源少，而且可用率、高匿性、速度等都极差，每次使用都需要借助第三方软件进行检查是否可用，严重影响效率，根本满足不了任务的需求。

收费的代理ip与免费的代理ip差距非常大，不仅拥有海量的ip资源，可用率、高匿性、速度都是极好。操作简单工作效率既然提高上去了。经过多家的测试，最终选择了性价比最高的万变ip。高质量的优质代理ip才可以真正用来防止爬虫被封锁，如果使用普通代理，爬虫的真实IP还是会暴露。新获取一批新IP

Python爬虫学习者需要注意什么？

python爬虫 • wanbainip 发表了文章 • 0 个评论 • 2412 次浏览 • 2020-10-28 17:14 • 来自相关话题

在这个大数据时代里，学习Python网络爬虫的人越来越多，在学习过程中难免会遇到一些问题，学习者最常见的问题就是遇到ip被限制，因为你在爬虫采集数据过程中，同一个ip频繁的对网站进行访问，会给对方服务器会造成压力，那么网站就根据ip访问的频率来限制你的ip，这也是反爬虫机制常见的一种判断。

最常见的解决方法就是使用大量的ip，就是借着代理ip保证IP被封时有替换IP可用，永远保持着续航能力。这里推荐51代理ip，作为一家提供代理IP的专业服务商，万变ip代理拥有强大的技术团队运营维护，全高匿系统所产生的高匿ip不仅安全稳定、而且速度快，以及与爬虫用户多年来合作的宝贵经验，是Python爬虫首选代理IP。

Python是一种全栈计算机程序设计语言，全栈，顾名思义，应用范围广。你可能听说过很多编程语言，例如C语言，Java语言等，众所周知，这些语言都非常难学，更别说景桐使用了。而python不一样，比如完成一个Web服务，C语言要写1000行代码，Java要写100行，而python可能只要写20行。对！这就是差距！目前由于python“简单易懂”，已逐步成为网络爬虫主流语言。

在初学python爬虫时，很多程序员会被一些“小问题”阻碍脚步，为避免大家再次犯同样的错误，加快学习进程，在爬取网站信息时一定要使用大量代理IP。好用的代理IP服务商，

高效率的爬虫工作离不开ip代理的支持，这就是ip代理越来越受欢迎的原因！收藏举报投诉查看全部

在这个大数据时代里，学习Python网络爬虫的人越来越多，在学习过程中难免会遇到一些问题，学习者最常见的问题就是遇到ip被限制，因为你在爬虫采集数据过程中，同一个ip频繁的对网站进行访问，会给对方服务器会造成压力，那么网站就根据ip访问的频率来限制你的ip，这也是反爬虫机制常见的一种判断。

最常见的解决方法就是使用大量的ip，就是借着代理ip保证IP被封时有替换IP可用，永远保持着续航能力。这里推荐51代理ip，作为一家提供代理IP的专业服务商，万变ip代理拥有强大的技术团队运营维护，全高匿系统所产生的高匿ip不仅安全稳定、而且速度快，以及与爬虫用户多年来合作的宝贵经验，是Python爬虫首选代理IP。

Python是一种全栈计算机程序设计语言，全栈，顾名思义，应用范围广。你可能听说过很多编程语言，例如C语言，Java语言等，众所周知，这些语言都非常难学，更别说精通使用了。而python不一样，比如完成一个Web服务，C语言要写1000行代码，Java要写100行，而python可能只要写20行。对！这就是差距！目前由于python“简单易懂”，已逐步成为网络爬虫主流语言。

在初学python爬虫时，很多程序员会被一些“小问题”阻碍脚步，为避免大家再次犯同样的错误，加快学习进程，在爬取网站信息时一定要使用大量代理IP。好用的代理IP服务商，

高效率的爬虫工作离不开ip代理的支持，这就是ip代理越来越受欢迎的原因！收藏举报投诉

Python爬虫虎牙平台主播的图片代码

python爬虫 • wanbainip 发表了文章 • 0 个评论 • 2467 次浏览 • 2020-10-27 17:55 • 来自相关话题

今天万变ip就为大家分享一下，Python爬虫是如何爬取虎牙平台的主播图片。这里我们主要爬取虎牙星秀的主播图片，并下载保存到本地。在爬取之前，我们可先打开目标页面，F12审查元素，查找我们需要的信息所在的位置。

代码如下：

import urllib.request

import re

import os

# 全局变量用来记录图片的编号

gl_z = 0

def down_img(url1):

"""下载图片"""

# 处理图片链接,拼接http:

url = "https:" + re.sub(r"\?", "", url1)

global gl_z

print(url)

# 请求链接

response = urllib.request.urlopen(url)

# 读取内容

data = response.read()

# 切片取出图片名称

file_name = url[url.rfind('/') + 1:]

# 生成列表

a = [x for x in range(10000)]

# 打开文件用以写入

file = open(os.path.join("photo3", "img" + file_name + str(a[gl_z]) + ".jpg"), "wb")

file.write(data)

# 关闭文件

file.close()

# 编号加1

gl_z += 1

if __name__ == '__main__':

# 要抓去信息的网址

home = """http://www.huya.com/g/xingxiu"""

# 模拟请求头

headers = {

"Host": "www.huya.com",

"User-Agent": "agent信息"

}

# 构造好请求对象将请求提交到服务器获取的响应就是到首页的html代码

request = urllib.request.Request(url=home, headers=headers)

response = urllib.request.urlopen(request)

# 读取抓到的内容并解码

html_data = response.read().decode()

"""huyaimg.msstatic.com/avatar/1054/db/6590aa9bcf98e12e5d809d371e46cc_180_135.jpg

"""

# 使用正则从首页中提取出所有的图片链接

img_list = re.findall(r"//huyaimg\.msstatic\.com.+\.jpg\?", html_data)

print(img_list)

# 取出每张图片进行下载

for img_url in img_list:

print(img_url)

down_img(img_url) 查看全部

今天万变ip就为大家分享一下，Python爬虫是如何爬取虎牙平台的主播图片。这里我们主要爬取虎牙星秀的主播图片，并下载保存到本地。在爬取之前，我们可先打开目标页面，F12审查元素，查找我们需要的信息所在的位置。

代码如下：

import urllib.request

import re

import os

# 全局变量用来记录图片的编号

gl_z = 0

def down_img(url1):
    """Download one image and save it under the ``photo3`` directory.

    :param url1: protocol-relative image link as matched on the home page,
        e.g. ``//huyaimg.msstatic.com/.../x.jpg?``.

    The file is written as ``photo3/img<name><n>.jpg`` where ``<n>`` is the
    running counter ``gl_z``, which is incremented after each download.
    """
    global gl_z

    # Prefix the scheme ("https:") and strip the "?" the regex leaves behind.
    url = "https:" + re.sub(r"\?", "", url1)
    print(url)

    # Fetch the image bytes; the context manager closes the connection.
    with urllib.request.urlopen(url) as response:
        data = response.read()

    # Everything after the last "/" is the image's own file name.
    file_name = url[url.rfind('/') + 1:]

    # Make sure the target directory exists before writing into it.
    os.makedirs("photo3", exist_ok=True)
    target = os.path.join("photo3", "img" + file_name + str(gl_z) + ".jpg")
    with open(target, "wb") as file:
        file.write(data)

    # Advance the counter so the next image gets a distinct suffix.
    gl_z += 1

if __name__ == '__main__':
    # Page to scrape.
    home = """http://www.huya.com/g/xingxiu"""

    # Request headers sent with the home-page request.
    headers = {
        "Host": "www.huya.com",
        "User-Agent": "agent信息"
    }

    # Build the request, fetch the home page and decode its HTML.
    request = urllib.request.Request(url=home, headers=headers)
    with urllib.request.urlopen(request) as response:
        html_data = response.read().decode()

    # Example of a link to extract:
    # huyaimg.msstatic.com/avatar/1054/db/6590aa9bcf98e12e5d809d371e46cc_180_135.jpg

    # Extract every image link. The match is non-greedy and stops at quotes,
    # whitespace and angle brackets, so several links on one HTML line are
    # returned as separate matches instead of one merged string.
    img_list = re.findall(
        r"//huyaimg\.msstatic\.com[^\s\"'<>]+?\.jpg\?", html_data)
    print(img_list)

    # Download each image in turn.
    for img_url in img_list:
        print(img_url)
        down_img(img_url)

python打造3D撞球小游戏

wanbainip 发表了文章 • 0 个评论 • 2755 次浏览 • 2020-10-26 15:52 • 来自相关话题

万变ip给大家分享一下制作炫酷动画所需的全部代码。

我们需要三组刚体（当您在Blender的对象上打开一个刚体的属性时，Blender将模拟与其它刚体的碰撞）：

1.平面

第2行代码创建了一个简单的平面，立方体将放置在该平面上。为了防止它因重力而坠落，我们将其设为“受体”第4行代码。

2. 圆环

第11-12行将第一个圆环的"Enabled"属性设置为false，防止由于重力而坠落。这样它就固定在那牵住整个链条。

3. 立方体

因为z循环第13行嵌套在x循环[第5行]中，我们将得到一个18X10的立方体组成的墙。查看全部

万变ip给大家分享一下制作炫酷动画所需的全部代码。

我们需要三组刚体（当您在Blender的对象上打开一个刚体的属性时，Blender将模拟与其它刚体的碰撞）：

1.平面

第2行代码创建了一个简单的平面，立方体将放置在该平面上。为了防止它因重力而坠落，我们将其设为“受体”第4行代码。

2. 圆环

第11-12行将第一个圆环的"Enabled"属性设置为false，防止由于重力而坠落。这样它就固定在那牵住整个链条。

3. 立方体

因为z循环第13行嵌套在x循环[第5行]中，我们将得到一个18X10的立方体组成的墙。

Python爬虫基本框架

python爬虫 • wanbainip 发表了文章 • 0 个评论 • 2236 次浏览 • 2020-10-25 18:01 • 来自相关话题

基本爬虫框架包括五个模块，万变ip来告诉你：

1. 爬虫调度器负责统筹其他四个模块协调工作。

2. URL管理器负责管理URL链接，包括已爬取的链接和未爬取的链接。

3. HTML下载器用于从URL管理器中获取未爬取的链接并下载其HTML网页。

4. HTML解析器用于解析HTML下载器下载的HTML网页，获取URL链接交给URL管理器，提取要获取的数据交给数据存储器。

5. 数据存储器用于将HTML解析器解析出来的数据存储到数据库或文件。查看全部

基本爬虫框架包括五个模块，万变ip来告诉你：

1. 爬虫调度器负责统筹其他四个模块协调工作。

2. URL管理器负责管理URL链接，包括已爬取的链接和未爬取的链接。

3. HTML下载器用于从URL管理器中获取未爬取的链接并下载其HTML网页。

4. HTML解析器用于解析HTML下载器下载的HTML网页，获取URL链接交给URL管理器，提取要获取的数据交给数据存储器。

5. 数据存储器用于将HTML解析器解析出来的数据存储到数据库或文件。

Python爬虫如何防止ip被封？

python爬虫 • wanbainip 发表了文章 • 0 个评论 • 2493 次浏览 • 2020-10-24 10:22 • 来自相关话题

Python爬虫采集数据过程中，经常遇到网站的反爬虫机制，当相同的ip频繁的对服务器进行访问，就很容易出现ip被限制的，这就是常见的一种反爬虫策略，ip被限制会影响工作的进度甚至无法再继续工作，那么如何去避免呢？

第一种：降低访问的速度，我们可以使用 time模块中的sleep，使程序每运行一次后就睡眠1s，这样可以很有效的降低ip被封机率，但是效率效果不是很高，一般是用于量小的采集任务。

第二种：使用类似万变ip代理这样的优质换ip软件，这也是爬虫工作者最常用的手段之一，通过代理ip来伪装我们的ip,隐藏本地真实的ip地址，让目标服务器无法识别是相同ip发出的请求，这样就很有效的防止ip被封。突破了ip的限制，采集数据的任务就会顺利，工作效率自然会提高！

查看全部

Python爬虫采集数据过程中，经常遇到网站的反爬虫机制，当相同的ip频繁的对服务器进行访问，就很容易出现ip被限制的，这就是常见的一种反爬虫策略，ip被限制会影响工作的进度甚至无法再继续工作，那么如何去避免呢？

第一种：降低访问的速度，我们可以使用 time模块中的sleep，使程序每运行一次后就睡眠1s，这样可以很有效的降低ip被封机率，但是效率效果不是很高，一般是用于量小的采集任务。

第二种：使用类似万变ip代理这样的优质换ip软件，这也是爬虫工作者最常用的手段之一，通过代理ip来伪装我们的ip,隐藏本地真实的ip地址，让目标服务器无法识别是相同ip发出的请求，这样就很有效的防止ip被封。突破了ip的限制，采集数据的任务就会顺利，工作效率自然会提高！

pipreqs python3 不兼容问题解决

李魔佛发表了文章 • 0 个评论 • 3786 次浏览 • 2020-10-21 16:02 • 来自相关话题

这里面可能会报几个错误：

1. 如果报错信息是：
如果提示“UnicodeDecodeError: 'gbk' codec can't decode ”的错误，需要指定字符集 --encoding=utf8

pipreqs ./ --encoding=utf8

2. 出现诸如：Traceback (most recent call last):
File "c:\anaconda\envs\py37\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "c:\anaconda\envs\py37\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\anaconda\envs\py37\Scripts\pipreqs.exe\__main__.py", line 7, in <module>
File "c:\anaconda\envs\py37\lib\site-packages\pipreqs\pipreqs.py", line 470, in main
init(args)
File "c:\anaconda\envs\py37\lib\site-packages\pipreqs\pipreqs.py", line 409, in init
follow_links=follow_links)
File "c:\anaconda\envs\py37\lib\site-packages\pipreqs\pipreqs.py", line 138, in get_all_imports
raise exc
File "c:\anaconda\envs\py37\lib\site-packages\pipreqs\pipreqs.py", line 124, in get_all_imports
tree = ast.parse(contents)
File "c:\anaconda\envs\py37\lib\ast.py", line 35, in parse
return compile(source, filename, mode, PyCF_ONLY_AST)
File "<unknown>", line 162
except Exception ,e:
^
SyntaxError: invalid syntax
明显的python2的语法报错，这时很可能是你的项目中有python2语法的py文件，可以使用
--ignore <dirs> ...忽略额外的目录

参数解决。

原文链接：
http://30daydo.com/article/619
转载请注明出处查看全部

这里面可能会报几个错误：

1. 如果报错信息是：
如果提示“UnicodeDecodeError: 'gbk' codec can't decode ”的错误，需要指定字符集 --encoding=utf8

pipreqs ./ --encoding=utf8

2. 出现诸如：

Traceback (most recent call last):

  File "c:\anaconda\envs\py37\lib\runpy.py", line 193, in _run_module_as_main

    "__main__", mod_spec)

  File "c:\anaconda\envs\py37\lib\runpy.py", line 85, in _run_code

    exec(code, run_globals)

  File "C:\anaconda\envs\py37\Scripts\pipreqs.exe\__main__.py", line 7, in <module>

  File "c:\anaconda\envs\py37\lib\site-packages\pipreqs\pipreqs.py", line 470, in main

    init(args)

  File "c:\anaconda\envs\py37\lib\site-packages\pipreqs\pipreqs.py", line 409, in init

    follow_links=follow_links)

  File "c:\anaconda\envs\py37\lib\site-packages\pipreqs\pipreqs.py", line 138, in get_all_imports

    raise exc

  File "c:\anaconda\envs\py37\lib\site-packages\pipreqs\pipreqs.py", line 124, in get_all_imports

    tree = ast.parse(contents)

  File "c:\anaconda\envs\py37\lib\ast.py", line 35, in parse

    return compile(source, filename, mode, PyCF_ONLY_AST)

  File "<unknown>", line 162

    except Exception ,e:

                     ^

SyntaxError: invalid syntax

明显的python2的语法报错，这时很可能是你的项目中有python2语法的py文件，可以使用
--ignore <dirs> ...忽略额外的目录

参数解决。

原文链接：
http://30daydo.com/article/619
转载请注明出处

pycharm实用插件

李魔佛发表了文章 • 0 个评论 • 2447 次浏览 • 2020-10-17 02:25 • 来自相关话题

1. rainbow bracket
括号对变成彩色的
2. kite
人工智能预测输入提示：根据机器学习，预测你下一个输入函数的名

asyncio中get_running_loop和get_event_loop的区别

李魔佛发表了文章 • 0 个评论 • 7736 次浏览 • 2020-10-10 17:03 • 来自相关话题

asyncio.get_running_loop()

asyncio.get_event_loop()

官方地址
https://docs.python.org/3/library/asyncio-eventloop.html?highlight=get_running_loop#asyncio.get_running_loop

get_running_loop() 是python3.7新增的函数，用于获取当前正在运行的loop；如果当前线程中没有正在运行的loop，就会报RuntimeError 错误。

并且get_running_loop 要在协程里面用，用来捕获当前的loop。

示例用法： 1 # -*- coding: utf-8 -*-
2
3 import asyncio
4
5 async def main():
6 await asyncio.sleep(10)
7 print('Done')
8 myloop = asyncio.get_running_loop()
9 print('current loop ')
10 print(id(myloop))
11
12 # loop = asyncio.get_running_loop()
13 loop = asyncio.get_event_loop()
14 print('current loop id')
15 print(id(loop))
16
17 # print(id(myloop))
18 try:
19 loop.run_until_complete(main())
20 except KeyboardInterrupt:
21 print('key board inpterrupt!')

运行结果：

可以看到两个loop的id是一样的，是同一个对象。
如果第十二行直接调用的话会报错。
asyncio.get_event_loop() 如果在主线程中调用，且还没有设置过任何event loop（事件循环），那么会创建一个事件循环，并返回。
查看全部

asyncio.get_running_loop()

asyncio.get_event_loop()

官方地址
https://docs.python.org/3/library/asyncio-eventloop.html?highlight=get_running_loop#asyncio.get_running_loop

get_running_loop() 是python3.7新增的函数，用于获取当前正在运行的loop；如果当前线程中没有正在运行的loop，就会报RuntimeError 错误。

并且get_running_loop 要在协程里面用，用来捕获当前的loop。

示例用法：

  1 # -*- coding: utf-8 -*-

  2              

  3 import asyncio

  4               

  5 async def main():

  6     await asyncio.sleep(10)

  7     print('Done')

  8     myloop = asyncio.get_running_loop()

  9     print('current loop ')

 10     print(id(myloop))

 11                                                                             

 12 # loop = asyncio.get_running_loop()

 13 loop = asyncio.get_event_loop()

 14 print('current loop id')

 15 print(id(loop))

 16              

 17 # print(id(myloop))

 18 try:         

 19     loop.run_until_complete(main())

 20 except KeyboardInterrupt:

 21     print('key board inpterrupt!')

运行结果：

可以看到两个loop的id是一样的，是同一个对象。
如果第十二行直接调用的话会报错。
asyncio.get_event_loop() 如果在主线程中调用，且还没有设置过任何event loop（事件循环），那么会创建一个事件循环，并返回。

愿意付费购买商超商品条形码对应的商品图片，有哪位大神可以帮帮忙啊？+V15032219667

python爬虫 • heikekang 发起了问题 • 2 人关注 • 0 个回复 • 2894 次浏览 • 2020-09-25 19:10 • 来自相关话题

python asyncio aiohttp motor异步爬虫例子定时抓取bilibili首页热度网红

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 3180 次浏览 • 2020-09-22 22:35 • 来自相关话题

使用的异步库： aiohttp【http异步库】，motor【mongo异步库】

AsyncIOMotorClient(connect_uri)
motor连接带用户名和密码的方法和pymongo一致。

connect_uri = f'mongodb://{user}:{password}@{host}:{port}'

# -*- coding: utf-8 -*-
# website: http://30daydo.com
# @Time : 2020/9/22 10:07
# @File : bilibili_hot_anchor.py

# 异步爬取首页与列表

import asyncio
import datetime
import aiohttp
import re
import time
from motor.motor_asyncio import AsyncIOMotorClient
from parsel import Selector
from settings import _json_data

SLEEP = 60 * 10
INFO = _json_data['mongo']['arm']
host = INFO['host']
port = INFO['port']
user = INFO['user']
password = INFO['password']
connect_uri = f'mongodb://{user}:{password}@{host}:{port}'

client = AsyncIOMotorClient(connect_uri)

db = client['db_parker']

home_url = 'https://www.bilibili.com/ranking'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'}

def convertor(number_str):
'''
将小数点的万变为整数
:param number_str:
:return:
'''
number = re.search('(\d+\.+\d+)', number_str)
if number:
number = float(number.group(1))
if re.search('万', number_str):
number = int(number * 10000)
else:
number = 0

return number

async def home_page():
async with aiohttp.ClientSession() as session:
while True:
start = time.time()
async with session.get(url=home_url, headers=headers) as response:
html = await response.text()
resp = Selector(text=html)
items = resp.xpath('//ul[@class="rank-list"]/li')
for item in items:
json_data = {}
number = item.xpath('.//div[@class="num"]/text()').extract_first()
info = item.xpath('.//div[@class="info"][1]')
title = info.xpath('.//a/text()').extract_first()

detail_url = info.xpath('.//a/@href').extract_first()
play_number = info.xpath('.//div[@class="detail"]/span[1]/text()').extract_first()
viewing_number = info.xpath('.//div[@class="detail"]/span[2]/text()').extract_first()

json_data['number'] = int(number)
json_data['title'] = title
json_data['play_number'] = convertor(play_number)
json_data['viewing_number'] = convertor(viewing_number)

json_data['url'] = detail_url

task = asyncio.create_task(detail_list(session, detail_url, json_data))
# await detail_url()
end = time.time()
print(f'time used {end-start}')
await asyncio.sleep(SLEEP) # 暂停10分钟
print(f'sleep for {SLEEP}')

async def detail_list(session, url, json_data):
async with session.get(url, headers=headers) as response:
response = await response.text()
await parse_detail(response, json_data)

async def parse_detail(html, json_data=None):
resp = Selector(text=html)
info = resp.xpath('//div[@id="v_desc"]/div[@class="info open"]/text()').extract_first()

if not info:
info = '这个家伙很懒'

json_data['info'] = info.strip()
current = datetime.datetime.now()
json_data['crawltime'] = current
await db['bilibili'].update_one({'url': json_data['url']}, {'$set': json_data}, True, True)

loop = asyncio.get_event_loop()
loop.run_until_complete(home_page())

爬取的数据图：

原创文章，转载请注明：http://30daydo.com/article/605
需要源码，可私信。

查看全部

使用的异步库： aiohttp【http异步库】，motor【mongo异步库】

AsyncIOMotorClient(connect_uri)
motor连接带用户名和密码的方法和pymongo一致。

connect_uri = f'mongodb://{user}:{password}@{host}:{port}'

# -*- coding: utf-8 -*-

# website: http://30daydo.com

# @Time : 2020/9/22 10:07

# @File : bilibili_hot_anchor.py



# 异步爬取首页与列表



import asyncio
import datetime
import re
import time
from urllib.parse import quote_plus

import aiohttp
from motor.motor_asyncio import AsyncIOMotorClient
from parsel import Selector

from settings import _json_data



# Pause between two crawls of the ranking page, in seconds (10 minutes).
SLEEP = 60 * 10

# MongoDB connection settings, read from the project-local settings module.
INFO = _json_data['mongo']['arm']
host = INFO['host']
port = INFO['port']
user = INFO['user']
password = INFO['password']

# Percent-escape the credentials: a raw '@', ':', '/' or '%' in the user name
# or password would otherwise yield an invalid or mis-parsed MongoDB URI.
# NOTE(review): assumes the settings file stores the raw (unescaped) values.
connect_uri = (
    f'mongodb://{quote_plus(str(user))}:{quote_plus(str(password))}'
    f'@{host}:{port}'
)

client = AsyncIOMotorClient(connect_uri)

db = client['db_parker']

home_url = 'https://www.bilibili.com/ranking'

# Request headers sent with every HTTP request of the crawler.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
           'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'}





def convertor(number_str):
    """Convert a count string such as '12.5万' or '4567' into a number.

    The first number found in the text is used.  When the text contains the
    unit '万' (ten thousand) the value is multiplied by 10000 and returned as
    an int.

    :param number_str: text that contains the count; may be None or empty
    :return: int when the text carries '万' or is a plain integer, float for
             a decimal without '万', and 0 when no number is present
    """
    if not number_str:
        return 0

    # The fractional part is optional so that plain integers ('4567', '3万')
    # are recognised as well, not only decimals.
    match = re.search(r'(\d+(?:\.\d+)?)', number_str)
    if match is None:
        return 0

    digits = match.group(1)
    if '万' in number_str:
        # round() instead of int(): a float product can land a hair below the
        # exact integer and would then be truncated to one less.
        return round(float(digits) * 10000)

    return float(digits) if '.' in digits else int(digits)





async def home_page():
    """Crawl the bilibili ranking page forever, once every ``SLEEP`` seconds.

    Each round downloads ``home_url``, reads rank, title, link and the two
    counters from every ``<li>`` of the ranking list and starts one
    ``detail_list`` task per entry.  All tasks of a round are awaited before
    the coroutine goes to sleep, so the reported time covers the whole round
    and task failures are reported instead of being lost.
    """
    async with aiohttp.ClientSession() as session:
        while True:
            start = time.time()

            # Only the download happens inside the response context; the
            # response is released before the long sleep below.
            async with session.get(url=home_url, headers=headers) as response:
                html = await response.text()

            resp = Selector(text=html)
            tasks = []
            for item in resp.xpath('//ul[@class="rank-list"]/li'):
                number = item.xpath('.//div[@class="num"]/text()').extract_first()
                info = item.xpath('.//div[@class="info"][1]')
                title = info.xpath('.//a/text()').extract_first()
                detail_url = info.xpath('.//a/@href').extract_first()
                play_number = info.xpath('.//div[@class="detail"]/span[1]/text()').extract_first()
                viewing_number = info.xpath('.//div[@class="detail"]/span[2]/text()').extract_first()

                # A row without rank or link cannot be stored or followed;
                # skip it instead of letting int(None) stop the crawler.
                if number is None or not detail_url:
                    continue

                json_data = {
                    'number': int(number),
                    'title': title,
                    # A missing counter is treated as an empty string.
                    'play_number': convertor(play_number or ''),
                    'viewing_number': convertor(viewing_number or ''),
                    'url': detail_url,
                }

                # Keep a reference to every task so none is dropped before
                # it has finished.
                tasks.append(asyncio.create_task(detail_list(session, detail_url, json_data)))

            # Wait for all detail pages; one failing page must not abort the
            # others or the crawl loop.
            results = await asyncio.gather(*tasks, return_exceptions=True)
            for result in results:
                if isinstance(result, Exception):
                    print(f'detail task failed: {result!r}')

            end = time.time()
            print(f'time used {end-start}')

            print(f'sleep for {SLEEP}')
            await asyncio.sleep(SLEEP)  # pause for 10 minutes





async def detail_list(session, url, json_data):
    """Download one detail page and pass its HTML on to ``parse_detail``.

    :param session: the shared aiohttp client session
    :param url: address of the detail page
    :param json_data: record collected from the ranking page for this entry
    """
    request = session.get(url, headers=headers)
    async with request as reply:
        page = await reply.text()
        await parse_detail(page, json_data)





async def parse_detail(html, json_data=None):
    """Add the description and crawl time to a record and upsert it.

    :param html: HTML text of the detail page
    :param json_data: record for this entry; must be a dict that already
                      contains the 'url' key, which identifies the document
    :raises TypeError: if ``json_data`` is None
    """
    if json_data is None:
        # The default exists only for signature compatibility; without a
        # record there is nothing to store.
        raise TypeError('json_data must be a dict that already contains a "url" key')

    resp = Selector(text=html)
    info = resp.xpath('//div[@id="v_desc"]/div[@class="info open"]/text()').extract_first()

    # Fall back to a placeholder text when the page has no description.
    json_data['info'] = (info or '这个家伙很懒').strip()
    json_data['crawltime'] = datetime.datetime.now()

    # upsert=True is passed by keyword: the former positional "True, True"
    # also switched on bypass_document_validation, the fourth parameter.
    await db['bilibili'].update_one(
        {'url': json_data['url']},
        {'$set': json_data},
        upsert=True,
    )





# Script entry point: run home_page() on the default event loop.  home_page()
# loops forever, so this call only returns when the coroutine raises.
# NOTE(review): the Motor client above is created at import time, before any
# loop is running; presumably it has to share this loop - confirm before
# replacing these lines with asyncio.run().
loop = asyncio.get_event_loop()

loop.run_until_complete(home_page())

爬取的数据图：

原创文章，转载请注明：http://30daydo.com/article/605
需要源码，可私信。

python3.8海象运算符

李魔佛发表了文章 • 0 个评论 • 2113 次浏览 • 2020-09-08 23:13 • 来自相关话题

海象运算符（ := ）

这个「:=」横过来看是不是有点像海象的脸？这是一个新的 Python 语法，可以在进行条件判断时直接为变量赋值。

过去我们需要首先对某个变量进行赋值，然后进行条件判断。m = re.match(p1, line)
if m:
return m.group(1)
else:
m = re.match(p2, line)
if m:
return m.group(2)
else:
m = re.match(p3, line)
...
而使用海象运算符后，我们可以直接为变量赋值：
if m := re.match(p1, line):
return m.group(1)
elif m := re.match(p2, line):
return m.group(2)
elif m := re.match(p3, line):
PS：
python的版本更新最喜欢搞这一类的小动作，仅仅为了节省那么一两行代码弄得代码无法向下兼容。

查看全部

海象运算符（ := ）

这个「:=」横过来看是不是有点像海象的脸？这是一个新的 Python 语法，可以在进行条件判断时直接为变量赋值。

过去我们需要首先对某个变量进行赋值，然后进行条件判断。

m = re.match(p1, line)

if m:

return m.group(1)

else:

m = re.match(p2, line)

if m:

return m.group(2)

else:

m = re.match(p3, line)

    ...

而使用海象运算符后，我们可以直接为变量赋值：

if m := re.match(p1, line):

return m.group(1)

elif m := re.match(p2, line):

return m.group(2)

elif m := re.match(p3, line):

PS：
python的版本更新最喜欢搞这一类的小动作，仅仅为了节省那么一两行代码弄得代码无法向下兼容。

python pyexecjs执行含有中文字符的js脚本报错

李魔佛发表了文章 • 0 个评论 • 4387 次浏览 • 2020-08-25 10:51 • 来自相关话题

报错信息如下：
File "C:\ProgramData\Anaconda3\lib\threading.py", line 926, in _bootstrap_inner
self.run()
File "C:\ProgramData\Anaconda3\lib\threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "C:\ProgramData\Anaconda3\lib\subprocess.py", line 1238, in _readerthread
buffer.append(fh.read())
UnicodeDecodeError: 'gbk' codec can't decode byte 0xbd in position 52: illegal multibyte sequence

File "C:\ProgramData\Anaconda3\lib\site-packages\execjs\_external_runtime.py", line 103, in _exec_with_pipe
stdoutdata, stderrdata = p.communicate(input=input)
File "C:\ProgramData\Anaconda3\lib\subprocess.py", line 939, in communicate
stdout, stderr = self._communicate(input, endtime, timeout)
File "C:\ProgramData\Anaconda3\lib\subprocess.py", line 1288, in _communicate
stdout = stdout[0]
IndexError: list index out of range
使用nodejs直接执行js是没有问题，同样代码在linux上执行也没有问题。
原因是中文 Windows 的默认编码为 cp936（GBK），修改 subprocess.py 文件的默认编码就可以解决。

def __init__(self, args, bufsize=-1, executable=None,
stdin=None, stdout=None, stderr=None,
preexec_fn=None, close_fds=True,
shell=False, cwd=None, env=None, universal_newlines=None,
startupinfo=None, creationflags=0,
restore_signals=True, start_new_session=False,
pass_fds=(), *, encoding="None", errors=None, text=None):
把上面的encoding=None改为 encoding="utf-8"，就可以了。查看全部

报错信息如下：

  File "C:\ProgramData\Anaconda3\lib\threading.py", line 926, in _bootstrap_inner

    self.run()

  File "C:\ProgramData\Anaconda3\lib\threading.py", line 870, in run

    self._target(*self._args, **self._kwargs)

  File "C:\ProgramData\Anaconda3\lib\subprocess.py", line 1238, in _readerthread

    buffer.append(fh.read())

UnicodeDecodeError: 'gbk' codec can't decode byte 0xbd in position 52: illegal multibyte sequence

  File "C:\ProgramData\Anaconda3\lib\site-packages\execjs\_external_runtime.py", line 103, in _exec_with_pipe

    stdoutdata, stderrdata = p.communicate(input=input)

  File "C:\ProgramData\Anaconda3\lib\subprocess.py", line 939, in communicate

    stdout, stderr = self._communicate(input, endtime, timeout)

  File "C:\ProgramData\Anaconda3\lib\subprocess.py", line 1288, in _communicate

    stdout = stdout[0]

IndexError: list index out of range

使用nodejs直接执行js是没有问题，同样代码在linux上执行也没有问题。
原因是中文 Windows 的默认编码为 cp936（GBK），修改 subprocess.py 文件的默认编码就可以解决。

    def __init__(self, args, bufsize=-1, executable=None,

                 stdin=None, stdout=None, stderr=None,

                 preexec_fn=None, close_fds=True,

                 shell=False, cwd=None, env=None, universal_newlines=None,

                 startupinfo=None, creationflags=0,

                 restore_signals=True, start_new_session=False,

                 pass_fds=(), *, encoding="None", errors=None, text=None):

把上面的encoding=None改为 encoding="utf-8"，就可以了。

爬虫nike登录流程抓包分析

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 2948 次浏览 • 2020-08-15 23:52 • 来自相关话题

<占坑> 敬请期待。

深圳住房公积金验证码识别破解

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 2686 次浏览 • 2020-06-26 14:34 • 来自相关话题

http://gjj.sz.gov.cn/fzgn/zfcq/index.html

比较常规的验证码，使用keras全连接层，cv切割后每个字符只需要20个样本就达到准确率99%。
需要模型或者代码的私聊。查看全部

http://gjj.sz.gov.cn/fzgn/zfcq/index.html

比较常规的验证码，使用keras全连接层，cv切割后每个字符只需要20个样本就达到准确率99%。
需要模型或者代码的私聊。

PyQt5自定义控件

李魔佛发表了文章 • 0 个评论 • 3082 次浏览 • 2020-06-13 23:14 • 来自相关话题

PyQt5包含种类丰富的控件。但能满足所有需求的控件库是不存在的。通常控件库只提供了像按钮、文本控件、滑块等最常用的控件。但如果需要某种特殊的控件，我们只能自己动手来实现。自定义控件需要使用工具库提供的绘图工具，可能有两种方式：在已有的控件上进行拓展或从头开始创建自定义控件。

Burning widget(烧录控件)
这个控件可能会在Nero，K3B或其他CD/DVD烧录软件中见到。

# -*- coding: utf-8 -*-

"""
PyQt5 tutorial

In this example, we create a custom widget.
"""
import sys
from PyQt5.QtWidgets import (QWidget, QSlider, QApplication,
QHBoxLayout, QVBoxLayout)
from PyQt5.QtCore import QObject, Qt, pyqtSignal
from PyQt5.QtGui import QPainter, QFont, QColor, QPen

class Communicate(QObject):
updateBW = pyqtSignal(int)

class BurningWidget(QWidget):
def __init__(self):
super().__init__()

self.initUI()

def initUI(self):

self.setMinimumSize(1, 30)
self.value = 75
self.num = [75, 150, 225, 300, 375, 450, 525, 600, 675]

def setValue(self, value):

self.value = value

def paintEvent(self, e):

qp = QPainter()
qp.begin(self)
self.drawWidget(qp)
qp.end()

def drawWidget(self, qp):

font = QFont('Serif', 7, QFont.Light)
qp.setFont(font)

size = self.size()
w = size.width()
h = size.height()

step = int(round(w / 10.0))

till = int(((w / 750.0) * self.value))
full = int(((w / 750.0) * 700))

if self.value >= 700:

qp.setPen(QColor(255, 255, 255))
qp.setBrush(QColor(255, 255, 184))
qp.drawRect(0, 0, full, h)
qp.setPen(QColor(255, 175, 175))
qp.setBrush(QColor(255, 175, 175))
qp.drawRect(full, 0, till - full, h)

else:

qp.setPen(QColor(255, 255, 255))
qp.setBrush(QColor(255, 255, 184))
qp.drawRect(0, 0, till, h)

pen = QPen(QColor(20, 20, 20), 1,
Qt.SolidLine)

qp.setPen(pen)
qp.setBrush(Qt.NoBrush)
qp.drawRect(0, 0, w - 1, h - 1)

j = 0

for i in range(step, 10 * step, step):
qp.drawLine(i, 0, i, 5)
metrics = qp.fontMetrics()
fw = metrics.width(str(self.num[j]))
qp.drawText(i - fw / 2, h / 2, str(self.num[j]))
j = j + 1

class Example(QWidget):
def __init__(self):
super().__init__()

self.initUI()

def initUI(self):
sld = QSlider(Qt.Horizontal, self)
sld.setFocusPolicy(Qt.NoFocus)
sld.setRange(1, 750)
sld.setValue(75)
sld.setGeometry(30, 40, 150, 30)

self.c = Communicate()
self.wid = BurningWidget()
self.c.updateBW[int].connect(self.wid.setValue)

sld.valueChanged[int].connect(self.changeValue)
hbox = QHBoxLayout()
hbox.addWidget(self.wid)
vbox = QVBoxLayout()
vbox.addStretch(1)
vbox.addLayout(hbox)
self.setLayout(vbox)

self.setGeometry(300, 300, 390, 210)
self.setWindowTitle('Burning widget')
self.show()

def changeValue(self, value):
self.c.updateBW.emit(value)
self.wid.repaint()

if __name__ == '__main__':
app = QApplication(sys.argv)
ex = Example()
sys.exit(app.exec_())
在示例中我们使用了滑块与一个自定义控件。自定义控件受滑块控制。控件显示了媒体介质的容量和剩余空间。该控件的最小值为1,最大值为750。在值超过700时颜色变为红色。这通常意味着超刻(即实际写入光盘的容量超过刻录盘片官方标称容量的一种操作)。

BurningWidget控件通过QHBoxLayout与QVBoxLayout置于窗体的底部。
class BurningWidget(QWidget):

def __init__(self):
super().__init__()

烧录的控件,它基于QWidget

self.setMinimumSize(1, 30)我们改变了控件的最小大小(高度),默认值为有点小。
font = QFont('Serif', 7, QFont.Light)
qp.setFont(font)我们使用一个比默认要小的字体。
size = self.size()
w = size.width()
h = size.height()

step = int(round(w / 10.0))

till = int(((w / 750.0) * self.value))
full = int(((w / 750.0) * 700))
控件采用了动态绘制技术。窗体越大，控件也随之变大；反之亦然。这也是我们需要计算自定义控件的载体控件(即窗体)尺寸的原因。till参数定义了需要绘制的总尺寸，它根据slider控件计算得出，是整体区域的比例值。full参数定义了红色区域的绘制起点。注意在绘制时为取得较大精度而使用的浮点数运算。

实际的绘制分三个步骤。黄色或红黄矩形的绘制，然后是刻度线的绘制，最后是刻度值的绘制。

metrics = qp.fontMetrics()
fw = metrics.width(str(self.num[j]))
qp.drawText(i-fw/2, h/2, str(self.num[j]))我们使用字体度量来绘制文本。我们必须知道文本的宽度,以中心垂直线。
def changeValue(self, value):

self.c.updateBW.emit(value)
self.wid.repaint()当滑块发生移动时，changeValue()方法会被调用。在方法内我们触发了一个自定义的updateBW信号，其参数是当前滚动条的值。该值被用于计算Burning widget的容量值。然后对控件进行重绘。

查看全部

PyQt5包含种类丰富的控件。但能满足所有需求的控件库是不存在的。通常控件库只提供了像按钮、文本控件、滑块等最常用的控件。但如果需要某种特殊的控件，我们只能自己动手来实现。自定义控件需要使用工具库提供的绘图工具，可能有两种方式：在已有的控件上进行拓展或从头开始创建自定义控件。

Burning widget(烧录控件)
这个控件可能会在Nero，K3B或其他CD/DVD烧录软件中见到。

# -*- coding: utf-8 -*-

 

"""

PyQt5 tutorial

 

In this example, we create a custom widget.

"""

import sys

from PyQt5.QtWidgets import (QWidget, QSlider, QApplication,

                             QHBoxLayout, QVBoxLayout)

from PyQt5.QtCore import QObject, Qt, pyqtSignal

from PyQt5.QtGui import QPainter, QFont, QColor, QPen

 

 

class Communicate(QObject):

    # Signal carrying one int.  Example emits it with the current slider
    # value, and BurningWidget.setValue is connected to it.
    updateBW = pyqtSignal(int)

 

 

class BurningWidget(QWidget):
    """Capacity bar in the style of a CD/DVD burning tool.

    The bar is scaled so that the full widget width corresponds to 750.  The
    part up to ``value`` is filled yellow; anything above 700 is drawn in red.
    Nine tick marks with the labels in ``num`` are drawn on top.
    """

    def __init__(self):
        super().__init__()

        self.initUI()

    def initUI(self):
        """Set the minimum size, the initial value and the tick labels."""
        self.setMinimumSize(1, 30)
        self.value = 75
        self.num = [75, 150, 225, 300, 375, 450, 525, 600, 675]

    def setValue(self, value):
        """Store the value to display; repainting is left to the caller."""
        self.value = value

    def paintEvent(self, e):
        """Paint the widget with a QPainter bound to it."""
        qp = QPainter()
        qp.begin(self)
        self.drawWidget(qp)
        qp.end()

    def drawWidget(self, qp):
        """Draw the filled bar, the frame, the tick marks and their labels.

        :param qp: active QPainter for this widget
        """
        font = QFont('Serif', 7, QFont.Light)
        qp.setFont(font)

        size = self.size()
        w = size.width()
        h = size.height()

        # Distance between two tick marks: one tenth of the width.
        step = int(round(w / 10.0))

        # Pixel positions of the current value and of the 700 mark.
        till = int(((w / 750.0) * self.value))
        full = int(((w / 750.0) * 700))

        if self.value >= 700:
            # Yellow up to the 700 mark, red for the part beyond it.
            qp.setPen(QColor(255, 255, 255))
            qp.setBrush(QColor(255, 255, 184))
            qp.drawRect(0, 0, full, h)
            qp.setPen(QColor(255, 175, 175))
            qp.setBrush(QColor(255, 175, 175))
            qp.drawRect(full, 0, till - full, h)
        else:
            qp.setPen(QColor(255, 255, 255))
            qp.setBrush(QColor(255, 255, 184))
            qp.drawRect(0, 0, till, h)

        pen = QPen(QColor(20, 20, 20), 1,
                   Qt.SolidLine)

        qp.setPen(pen)
        qp.setBrush(Qt.NoBrush)
        qp.drawRect(0, 0, w - 1, h - 1)

        if step <= 0:
            # Very narrow widget: the step rounds to 0 and range() rejects a
            # zero step, so there is no room for tick marks.
            return

        metrics = qp.fontMetrics()
        for i, label in zip(range(step, 10 * step, step), map(str, self.num)):
            qp.drawLine(i, 0, i, 5)
            fw = metrics.width(label)
            # drawText(x, y, text) takes int coordinates; passing the float
            # results of "/" raises TypeError on Python 3.10 and later.
            qp.drawText(int(i - fw / 2), int(h / 2), label)

 

 

class Example(QWidget):
    """Demo window: a slider that drives a BurningWidget placed at the bottom."""

    def __init__(self):
        super().__init__()

        self.initUI()

    def initUI(self):
        """Create the slider and the burning widget, wire them up and show the window."""
        slider = QSlider(Qt.Horizontal, self)
        slider.setFocusPolicy(Qt.NoFocus)
        slider.setRange(1, 750)
        slider.setValue(75)
        slider.setGeometry(30, 40, 150, 30)

        # The slider value reaches the burning widget through the custom
        # updateBW signal.
        self.c = Communicate()
        self.wid = BurningWidget()
        self.c.updateBW[int].connect(self.wid.setValue)

        slider.valueChanged[int].connect(self.changeValue)

        # The stretch above the row pushes the burning widget to the bottom.
        row = QHBoxLayout()
        row.addWidget(self.wid)
        column = QVBoxLayout()
        column.addStretch(1)
        column.addLayout(row)
        self.setLayout(column)

        self.setGeometry(300, 300, 390, 210)
        self.setWindowTitle('Burning widget')
        self.show()

    def changeValue(self, value):
        """Forward the new slider value and repaint the burning widget."""
        self.c.updateBW.emit(value)
        self.wid.repaint()

 

 

if __name__ == '__main__':
    # Create the Qt application and the demo window, run the event loop and
    # hand its return code to the interpreter as exit status.
    application = QApplication(sys.argv)
    window = Example()
    exit_code = application.exec_()
    sys.exit(exit_code)

在示例中我们使用了滑块与一个自定义控件。自定义控件受滑块控制。控件显示了媒体介质的容量和剩余空间。该控件的最小值为1,最大值为750。在值超过700时颜色变为红色。这通常意味着超刻(即实际写入光盘的容量超过刻录盘片官方标称容量的一种操作)。

BurningWidget控件通过QHBoxLayout与QVBoxLayout置于窗体的底部。

class BurningWidget(QWidget):

  

    def __init__(self):      

        super().__init__()

烧录的控件,它基于QWidget

self.setMinimumSize(1, 30)

我们修改了控件的最小尺寸（高度），因为默认值有点小。

font = QFont('Serif', 7, QFont.Light)

qp.setFont(font)

我们使用一个比默认要小的字体。

size = self.size()

w = size.width()

h = size.height()

 

step = int(round(w / 10.0))

 

 

till = int(((w / 750.0) * self.value))

full = int(((w / 750.0) * 700))

控件采用了动态绘制技术。窗体越大，控件也随之变大；反之亦然。这也是我们需要计算自定义控件的载体控件(即窗体)尺寸的原因。till参数定义了需要绘制的总尺寸，它根据slider控件计算得出，是整体区域的比例值。full参数定义了红色区域的绘制起点。注意在绘制时为取得较大精度而使用的浮点数运算。

实际的绘制分三个步骤。黄色或红黄矩形的绘制，然后是刻度线的绘制，最后是刻度值的绘制。

metrics = qp.fontMetrics()

fw = metrics.width(str(self.num[j]))

qp.drawText(i-fw/2, h/2, str(self.num[j]))

我们使用字体度量来绘制文本。必须先知道文本的宽度，才能让文本以刻度线为中心水平居中。

def changeValue(self, value):

          

    self.c.updateBW.emit(value)        

    self.wid.repaint()

当滑块发生移动时，changeValue()方法会被调用。在方法内我们触发了一个自定义的updateBW信号，其参数是当前滚动条的值。该值被用于计算Burning widget的容量值。然后对控件进行重绘。

Windows安装pyminizip

李魔佛发表了文章 • 0 个评论 • 3513 次浏览 • 2020-05-31 19:06 • 来自相关话题

python3直接安装会报错：
pip install pyminizip
电脑需要安装vc的编译库，或者在其他机子上把pyd文件拷贝到程序的当前目录。

pyqt5 QRect在哪个类

李魔佛发表了文章 • 0 个评论 • 2504 次浏览 • 2020-04-24 10:45 • 来自相关话题

最新的版本是在 QtCore里面的
from PyQt5.QtCore import Qt,QRect

最新的版本是在 QtCore里面的

from PyQt5.QtCore import Qt,QRect

薅“疫情公益”羊毛，黑产恶意爬取各大出版社电子书上万册

python爬虫 • Magiccc 发表了文章 • 0 个评论 • 2813 次浏览 • 2020-02-26 13:17 • 来自相关话题

疫情以来，所有企业都上班延期选择在线复工，在我们居家自我隔离期间，极验观察爬虫却没有消停，反而爬虫行为更加活跃且更胜往常。本周五，我们和无糖信息一起聊聊线上爬虫的“疫情”。

爬虫发送弹幕问题

python爬虫 • naythefirst 发起了问题 • 1 人关注 • 0 个回复 • 3257 次浏览 • 2020-02-26 11:28 • 来自相关话题

jieba.posseg TypeError: cannot unpack non-iterable pair object 词性分析报错

李魔佛发表了文章 • 0 个评论 • 4077 次浏览 • 2019-11-23 10:12 • 来自相关话题

词性标注的例子出现错误 'pair' object is not iterable

例子：import jieba.posseg as pseg
seg_list = pseg.cut("我爱北京天安门")
for word,flag in seg_list:
print(word)
print(flag)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-f105f6980f88> in <module>()
1 import jieba.posseg as pseg
2 seg_list = pseg.cut("我爱北京天安门")
----> 3 for word,flag in seg_list:
4 print(word)
5 print(flag)

TypeError: cannot unpack non-iterable pair object 原因是新版本中 seg_list 是一个生成器，其中每个元素是一个 pair 对象，不能直接解包成 word, flag，所以只能写成 for w in seg_list，然后从 w 中取出 word 和 flag：

print(w.word)

print(w.flag)

这样问题就解决了。查看全部

词性标注的例子出现错误 'pair' object is not iterable

例子：

import jieba.posseg as pseg

seg_list = pseg.cut("我爱北京天安门")

for word,flag in seg_list:

    print(word)

    print(flag)

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-5-f105f6980f88> in <module>()

      1 import jieba.posseg as pseg

      2 seg_list = pseg.cut("我爱北京天安门")

----> 3 for word,flag in seg_list:

      4     print(word)

      5     print(flag)



TypeError: cannot unpack non-iterable pair object

原因是新版本中 seg_list 是一个生成器，其中每个元素是一个 pair 对象，不能直接解包成 word, flag，所以只能写成 for w in seg_list，

然后从 w 中取出 word 和 flag：

print(w.word)

print(w.flag)

这样问题就解决了。

通知设置新通知