python

jupyter notebook 显示 opencv的图片

李魔佛发表了文章 • 0 个评论 • 8831 次浏览 • 2018-09-22 22:55 • 来自相关话题

import sys
import cv2
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inlineimg = cv2.imread('forest.jpg')
plt.imshow(img)效果如图：

查看全部

import sys

import cv2

from matplotlib import pyplot as plt

import matplotlib

%matplotlib inline

img = cv2.imread('forest.jpg')

plt.imshow(img)

效果如图：

python爬虫集思录所有用户的帖子 scrapy写入mongodb数据库

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 7111 次浏览 • 2018-09-02 21:52 • 来自相关话题

好久没更新了，把之前做的一些爬虫分享一下。不然都没有用户来了。-. -

项目采用scrapy的框架，数据写入到mongodb的数据库。整个站点爬下来大概用了半小时，数据有12w条。

项目中的主要代码如下：

主spider# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy import Request, FormRequest
from jsl.items import JslItem
from jsl import config
import logging

class AllcontentSpider(scrapy.Spider):
name = 'allcontent'

headers = {
'Host': 'www.jisilu.cn', 'Connection': 'keep-alive', 'Pragma': 'no-cache',
'Cache-Control': 'no-cache', 'Accept': 'application/json,text/javascript,*/*;q=0.01',
'Origin': 'https://www.jisilu.cn', 'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Referer': 'https://www.jisilu.cn/login/',
'Accept-Encoding': 'gzip,deflate,br',
'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8'
}

def start_requests(self):
login_url = 'https://www.jisilu.cn/login/'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
'Cache-Control': 'no-cache', 'Connection': 'keep-alive',
'Host': 'www.jisilu.cn', 'Pragma': 'no-cache', 'Referer': 'https://www.jisilu.cn/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36'}

yield Request(url=login_url, headers=headers, callback=self.login,dont_filter=True)

def login(self, response):
url = 'https://www.jisilu.cn/account/ajax/login_process/'
data = {
'return_url': 'https://www.jisilu.cn/',
'user_name': config.username,
'password': config.password,
'net_auto_login': '1',
'_post_type': 'ajax',
}

yield FormRequest(
url=url,
headers=self.headers,
formdata=data,
callback=self.parse,
dont_filter=True
)

def parse(self, response):
for i in range(1,3726):
focus_url = 'https://www.jisilu.cn/home/explore/sort_type-new__day-0__page-{}'.format(i)
yield Request(url=focus_url, headers=self.headers, callback=self.parse_page,dont_filter=True)

def parse_page(self, response):
nodes = response.xpath('//div[@class="aw-question-list"]/div')
for node in nodes:
each_url=node.xpath('.//h4/a/@href').extract_first()
yield Request(url=each_url,headers=self.headers,callback=self.parse_item,dont_filter=True)

def parse_item(self,response):
item = JslItem()
title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()
ret = re.findall('(.*?)\.donate_user_avatar', s, re.S)

try:
content = ret[0].strip()
except:
content = None

createTime = response.xpath('//div[@class="aw-question-detail-meta"]/span/text()').extract_first()

resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first('\d+')

url = response.url
item['title'] = title.strip()
item['content'] = content
try:
item['resp_no']=int(resp_no)
except Exception as e:
logging.warning('e')
item['resp_no']=None

item['createTime'] = createTime
item['url'] = url.strip()
resp =
for index,reply in enumerate(response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')):
replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
rep_content = reply.xpath(
'.//div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first()
# print rep_content
agree=reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()
resp.append({replay_user.strip()+'_{}'.format(index): [int(agree),rep_content.strip()]})

item['resp'] = resp
yield item

login函数是模拟登录集思录，通过抓包就可以知道一些上传的data。
然后就是分页去抓取。逻辑很简单。

然后pipeline里面写入mongodb。import pymongo
from collections import OrderedDict
class JslPipeline(object):
def __init__(self):
self.db = pymongo.MongoClient(host='10.18.6.1',port=27017)
# self.user = u'neo牛3' # 修改为指定的用户名如毛之川，然后找到用户的id，在用户也的源码哪里可以找到比如持有封基是8132
self.collection = self.db['db_parker']['jsl']
def process_item(self, item, spider):
self.collection.insert(OrderedDict(item))
return item
抓取到的数据入库mongodb：

点击查看大图

原创文章
转载请注明出处：http://30daydo.com/publish/article/351

查看全部

好久没更新了，把之前做的一些爬虫分享一下。不然都没有用户来了。-. -

项目采用scrapy的框架，数据写入到mongodb的数据库。整个站点爬下来大概用了半小时，数据有12w条。

项目中的主要代码如下：

主spider

# -*- coding: utf-8 -*-

import re

import scrapy

from scrapy import Request, FormRequest

from jsl.items import JslItem

from jsl import config

import logging



class AllcontentSpider(scrapy.Spider):
    """Log in to jisilu.cn, walk every listing page and scrape every thread.

    Request chain: ``start_requests`` (login page) -> ``login`` (credential
    post) -> ``parse`` (listing pages) -> ``parse_page`` (thread links) ->
    ``parse_item`` (one ``JslItem`` per thread).
    """

    name = 'allcontent'

    # Number of the last listing page requested by ``parse``.
    # NOTE(review): presumably the page count of the site when the spider
    # was written -- confirm before re-running.
    last_page = 3725

    # Headers sent with the login post and with every later request.
    headers = {
        'Host': 'www.jisilu.cn',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Accept': 'application/json,text/javascript,*/*;q=0.01',
        'Origin': 'https://www.jisilu.cn',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Referer': 'https://www.jisilu.cn/login/',
        'Accept-Encoding': 'gzip,deflate,br',
        'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
    }

    @staticmethod
    def _strip(text):
        """Return ``text`` stripped, or None when the selector matched nothing."""
        return text.strip() if text is not None else None

    @staticmethod
    def _to_int(text):
        """Return ``int(text)``, or None when ``text`` is missing or not numeric."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    def start_requests(self):
        """Request the login page; its response is handled by ``login``."""
        login_url = 'https://www.jisilu.cn/login/'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,br',
            'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.jisilu.cn',
            'Pragma': 'no-cache',
            'Referer': 'https://www.jisilu.cn/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/67.0.3396.99Safari/537.36',
        }

        yield Request(url=login_url, headers=headers, callback=self.login, dont_filter=True)

    def login(self, response):
        """Post the credentials from ``jsl.config`` to the AJAX login endpoint."""
        url = 'https://www.jisilu.cn/account/ajax/login_process/'
        data = {
            'return_url': 'https://www.jisilu.cn/',
            'user_name': config.username,
            'password': config.password,
            'net_auto_login': '1',
            '_post_type': 'ajax',
        }

        yield FormRequest(
            url=url,
            headers=self.headers,
            formdata=data,
            callback=self.parse,
            dont_filter=True,
        )

    def parse(self, response):
        """Schedule every listing page once the login response has arrived."""
        for page in range(1, self.last_page + 1):
            focus_url = 'https://www.jisilu.cn/home/explore/sort_type-new__day-0__page-{}'.format(page)
            yield Request(url=focus_url, headers=self.headers, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        """Follow the link of every thread shown on one listing page."""
        for node in response.xpath('//div[@class="aw-question-list"]/div'):
            each_url = node.xpath('.//h4/a/@href').extract_first()
            if not each_url:
                # No title link in this entry; Request(url=None) would raise.
                continue
            # urljoin leaves absolute links untouched and resolves relative ones.
            yield Request(url=response.urljoin(each_url), headers=self.headers,
                          callback=self.parse_item, dont_filter=True)

    def parse_item(self, response):
        """Build one ``JslItem`` (thread fields plus its replies) from a thread page.

        Fields whose selector matches nothing are stored as None instead of
        aborting the whole item.
        """
        item = JslItem()
        title = response.xpath('//div[@class="aw-mod-head"]/h1/text()').extract_first()
        s = response.xpath('//div[@class="aw-question-detail-txt markitup-box"]').xpath('string(.)').extract_first()

        # Keep only the text in front of the '.donate_user_avatar' marker.
        # NOTE(review): presumably inline CSS pulled in by string(.) -- confirm.
        match = re.search(r'(.*?)\.donate_user_avatar', s, re.S) if s else None
        content = match.group(1).strip() if match else None

        createTime = response.xpath('//div[@class="aw-question-detail-meta"]/span/text()').extract_first()

        resp_no = response.xpath('//div[@class="aw-mod aw-question-detail-box"]//ul/h2/text()').re_first(r'\d+')

        item['title'] = self._strip(title)
        item['content'] = content
        item['resp_no'] = self._to_int(resp_no)
        if item['resp_no'] is None:
            logging.warning('no reply count found on %s', response.url)

        item['createTime'] = createTime
        item['url'] = response.url.strip()

        # One single-key dict per reply: {'<user>_<index>': [votes, text]}.
        resp = []
        replies = response.xpath('//div[@class="aw-mod-body aw-dynamic-topic"]/div[@class="aw-item"]')
        for index, reply in enumerate(replies):
            replay_user = reply.xpath('.//div[@class="pull-left aw-dynamic-topic-content"]//p/a/text()').extract_first()
            rep_content = reply.xpath(
                './/div[@class="pull-left aw-dynamic-topic-content"]//div[@class="markitup-box"]/text()').extract_first()
            agree = reply.xpath('.//em[@class="aw-border-radius-5 aw-vote-count pull-left"]/text()').extract_first()

            key = '{}_{}'.format(self._strip(replay_user) or '', index)
            resp.append({key: [self._to_int(agree), self._strip(rep_content)]})

        item['resp'] = resp
        yield item

login函数是模拟登录集思录，通过抓包就可以知道一些上传的data。
然后就是分页去抓取。逻辑很简单。

然后pipeline里面写入mongodb。

import pymongo

from collections import OrderedDict

class JslPipeline(object):
    """Scrapy item pipeline that stores every scraped item in MongoDB."""

    def __init__(self):
        # NOTE: despite its name ``self.db`` holds the MongoClient; the
        # attribute name is kept so existing users of it keep working.
        self.db = pymongo.MongoClient(host='10.18.6.1', port=27017)
        # (Unused) To target a single user, set the user name here and look up
        # the numeric user id in the page source of that user's profile,
        # e.g. the id of 持有封基 is 8132.
        # self.user = u'neo牛3'
        self.collection = self.db['db_parker']['jsl']

    def process_item(self, item, spider):
        """Insert a copy of ``item`` into the collection and pass ``item`` on."""
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        self.collection.insert_one(OrderedDict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the crawl ends."""
        self.db.close()

抓取到的数据入库mongodb：

点击查看大图

原创文章
转载请注明出处：http://30daydo.com/publish/article/351

docker里运行mongodb，保存的数据在外部使用mongoexport不能导出：提示错误Unrecognized field 'snapshot'

李魔佛发表了文章 • 0 个评论 • 11027 次浏览 • 2018-08-31 14:21 • 来自相关话题

## 2019-03-19更新问题已解决
很无语。目前还找不到原因。

docker里面运行的mongodb， mongodb的数据挂载到宿主机。开放了27017端口。
在windows下使用mongoexport工具导出数据：

错误信息：C:\Program Files\MongoDB\Server\3.4\bin>mongoexport.exe /h 10.18.6.102 /d stock
/c company /o company.json /type json
2018-08-31T14:13:47.841+0800 connected to: 10.18.6.102
2018-08-31T14:13:47.854+0800 Failed: Failed to parse: { find: "company", filt
er: {}, sort: {}, skip: 0, snapshot: true, $readPreference: { mode: "secondaryPr
eferred" }, $db: "stock" }. Unrecognized field 'snapshot'.

C:\Program Files\MongoDB\Server\3.4\bin>
目前这个问题已经解决：
需要进去docker容器里面，然后在容器里面操作，把数据导出来到挂载的目录下，然后可以直接获取到数据了。查看全部

## 2019-03-19更新问题已解决
很无语。目前还找不到原因。

docker里面运行的mongodb， mongodb的数据挂载到宿主机。开放了27017端口。
在windows下使用mongoexport工具导出数据：

错误信息：

C:\Program Files\MongoDB\Server\3.4\bin>mongoexport.exe /h 10.18.6.102 /d stock

/c company /o company.json /type json

2018-08-31T14:13:47.841+0800    connected to: 10.18.6.102

2018-08-31T14:13:47.854+0800    Failed: Failed to parse: { find: "company", filt

er: {}, sort: {}, skip: 0, snapshot: true, $readPreference: { mode: "secondaryPr

eferred" }, $db: "stock" }. Unrecognized field 'snapshot'.



C:\Program Files\MongoDB\Server\3.4\bin>

目前这个问题已经解决：
需要进去docker容器里面，然后在容器里面操作，把数据导出来到挂载的目录下，然后可以直接获取到数据了。

django不同版本的兼容性太麻烦了

李魔佛发表了文章 • 0 个评论 • 3907 次浏览 • 2018-08-26 18:20 • 来自相关话题

对于新人来说太坑爹，不同版本，即使是一个小版本，很多函数都作了修改，或者直接被移除。好坑。

how to use proxy in scrapy_splash ?

python爬虫 • 李魔佛发表了文章 • 3 个评论 • 4344 次浏览 • 2018-08-24 21:44 • 来自相关话题

方法一；
yield scrapy.Request(
url=self.base_url.format(i),
meta={'page':str(i),
'splash': {
'args': {
'images':0,
'wait': 15,
'proxy': self.get_proxy(),
},
'endpoint': 'render.html',
},
},
)

其中 get_proxy() 返回的是字符串，类似于 http://8.8.8.8:8888 这样格式的代理地址。
这个方式自己试过是可以使用的。

当然也可以使用 scrapy_splash 中的 SplashRequest方法进行调用，参数一样，只是位置有点变化。

方法二是写中间件，不过自己试了很多次，没有成功。感觉网上的都是忽悠。
就是在 process_request 中修改 request.meta['splash']['args']['proxy']=xxxxxxx
无效，另外一个朋友也沟通过，也是说无法生效。

如果有人成功了的话，可以私信交流交流。
查看全部

方法一；

# Fragment of a spider method: ``self`` and the page number ``i`` come from the
# enclosing scope. The 'splash' entry in ``meta`` carries the Splash options.
yield scrapy.Request(

                url=self.base_url.format(i),

                # 'page' keeps the page number as a string alongside the request
                meta={'page':str(i),

                    'splash': {

                        'args': {

                            'images':0,

                            'wait': 15,

                            # proxy URL string returned by get_proxy()
                            'proxy': self.get_proxy(),

                        },

                        'endpoint': 'render.html',

                        },

            },

            )

其中 get_proxy() 返回的是字符串，类似于 http://8.8.8.8:8888 这样格式的代理地址。
这个方式自己试过是可以使用的。

当然也可以使用 scrapy_splash 中的 SplashRequest方法进行调用，参数一样，只是位置有点变化。

方法二是写中间件，不过自己试了很多次，没有成功。感觉网上的都是忽悠。
就是在 process_request 中修改 request.meta['splash']['args']['proxy']=xxxxxxx
无效，另外一个朋友也沟通过，也是说无法生效。

如果有人成功了的话，可以私信交流交流。

python mongodb大数据(>3GB)转移Mysql数据库

李魔佛发表了文章 • 0 个评论 • 5605 次浏览 • 2018-08-20 15:44 • 来自相关话题

数据约为5GB左右,如果直接用for i in doc.find({})进行逐行遍历的话,游标就会超时,而且越到后面速度越慢.

于是使用了分段遍历的方法.# -*-coding=utf-8-*-
import pandas as pd
import json
import pymongo
from sqlalchemy import create_engine

# 将mongo数据转移到mysql

client = pymongo.MongoClient('xxx')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://xxx:xxx@xxx:/xxx?charset=utf8')

def classic_method():
temp =
start = 0
# 数据太大还是会爆内存,或者游标丢失
for i in doc.find().batch_size(500):
start += 1
del i['_id']
temp.append(i)
print(start)

print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid', drop=True)
df.to_sql('meituan', con=engine, if_exists='replace')
print('done')

def chunksize_move():
block = 10000
total = doc.find({}).count()
iter_number = total // block

for i in range(iter_number + 1):
small_part = doc.find({}).limit(block).skip(i * block)
list_data =

for item in small_part:
del item['_id']
del item['crawl_time']
item['poiid'] = int(item['poiid'])
for k, v in item.items():
if isinstance(v, dict) or isinstance(v, list):

item[k] = json.dumps(v, ensure_ascii=False)

list_data.append(item)

df = pd.DataFrame(list_data)
df = df.set_index('poiid', drop=True)

try:
df.to_sql('meituan', con=engine, if_exists='append')
print('to sql {}'.format(i))
except Exception as e:
print(e)

chunksize_move()

速度比一次批量的要快不少. 查看全部

数据约为5GB左右,如果直接用

for i in doc.find({})

进行逐行遍历的话,游标就会超时,而且越到后面速度越慢.

于是使用了分段遍历的方法.

# -*-coding=utf-8-*-

import pandas as pd

import json

import pymongo

from sqlalchemy import create_engine



# 将mongo数据转移到mysql



# Source: the 'meituan' collection of the 'spider' database
# ('xxx' is a placeholder for the real MongoDB address).
client = pymongo.MongoClient('xxx')

doc = client['spider']['meituan']

# Target: MySQL via the PyMySQL driver; user, password, host and schema are placeholders.
engine = create_engine('mysql+pymysql://xxx:xxx@xxx:/xxx?charset=utf8')





def classic_method():
    """Copy the whole collection to the MySQL table ``meituan`` in one shot.

    Every document is held in memory before the single ``to_sql`` call, so
    on large collections this can exhaust memory or lose the cursor.
    Reads the module-level ``doc`` (collection) and ``engine``.
    """
    temp = []
    # batch_size only controls how many documents each round trip returns;
    # the full result still accumulates in ``temp``.
    for start, record in enumerate(doc.find().batch_size(500), start=1):
        del record['_id']
        temp.append(record)
        print(start)

    if not temp:
        # Nothing to export; an empty frame has no 'poiid' column to index on.
        print('done')
        return

    print('start to save to mysql')
    # Build the frame directly from the dicts (as chunksize_move does) rather
    # than serialising everything to JSON and parsing it back.
    df = pd.DataFrame(temp)
    df = df.set_index('poiid', drop=True)
    df.to_sql('meituan', con=engine, if_exists='replace')
    print('done')





def chunksize_move():
    """Copy the collection to the MySQL table ``meituan`` page by page.

    Documents are read in pages of ``block`` and each page is appended to the
    table, so only one page is held in memory at a time. Reads the
    module-level ``doc`` (collection) and ``engine``. A page that fails to
    write is reported and skipped; the remaining pages are still processed.
    """
    block = 10000
    # Cursor.count() was removed in PyMongo 4; count_documents replaces it.
    total = doc.count_documents({})
    # Ceiling division: no trailing empty page when total is a multiple of block.
    iter_number = (total + block - 1) // block

    for i in range(iter_number):
        # Sort on _id so skip/limit pages neither overlap nor miss documents.
        small_part = doc.find({}).sort('_id', 1).skip(i * block).limit(block)
        list_data = []

        for item in small_part:
            del item['_id']
            # Not every document is guaranteed to carry crawl_time.
            item.pop('crawl_time', None)
            item['poiid'] = int(item['poiid'])
            for k, v in item.items():
                # Nested values are stored as JSON text in their column.
                if isinstance(v, (dict, list)):
                    item[k] = json.dumps(v, ensure_ascii=False)

            list_data.append(item)

        if not list_data:
            # An empty page would make set_index('poiid') raise KeyError.
            continue

        df = pd.DataFrame(list_data)
        df = df.set_index('poiid', drop=True)

        try:
            df.to_sql('meituan', con=engine, if_exists='append')
            print('to sql {}'.format(i))
        except Exception as e:
            # Best effort: report the failed page and carry on with the next.
            print(e)



chunksize_move()

速度比一次批量的要快不少.

python 把mongodb的数据迁移到mysql

李魔佛发表了文章 • 0 个评论 • 5133 次浏览 • 2018-08-20 11:02 • 来自相关话题

代码如下: 很简短.
import pymongo
from setting import get_engine

# 将mongo数据转移到mysql

client = pymongo.MongoClient('10.18.6.101')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://localhost:1234@10.18.4.211/spider?charset=utf8')
temp=[]

for i in doc.find({}):
del i['_id']
temp.append(i)
print('start to save to mysql')
df = pd.read_json(json.dumps(temp))
df = df.set_index('poiid',drop=True)
df.to_sql('meituan',con=engine,if_exists='replace')
print('done')

居然CPU飙到了90%
查看全部

代码如下: 很简短.

import pandas as pd
import pymongo
from sqlalchemy import create_engine

from setting import get_engine  # NOTE(review): imported but not used below

# Move the MongoDB data into MySQL.

client = pymongo.MongoClient('10.18.6.101')
doc = client['spider']['meituan']
engine = create_engine('mysql+pymysql://localhost:1234@10.18.4.211/spider?charset=utf8')
temp = []

# Load every document, dropping Mongo's ``_id`` field before the export.
for i in doc.find({}):
    del i['_id']
    temp.append(i)

print('start to save to mysql')
# Build the frame directly from the dicts; serialising to JSON and parsing it
# back only costs extra CPU and memory.
df = pd.DataFrame(temp)
df = df.set_index('poiid', drop=True)
df.to_sql('meituan', con=engine, if_exists='replace')
print('done')

居然CPU飙到了90%

python json.loads 文件中的字典不能用单引号

李魔佛发表了文章 • 0 个评论 • 5642 次浏览 • 2018-08-20 09:28 • 来自相关话题

python json.loads 文件中的字典不能用单引号
只能改成双引号,或者使用

with open('cookies', 'r') as f:
# js = json.load(f)
js=eval(f.read())
# cookie=js.get('Cookie','')
headers = js.get('headers', '')

#content为文件的内容查看全部

python json.loads 文件中的字典不能用单引号

只能改成双引号,或者使用



import ast

with open('cookies', 'r') as f:
    # The file holds a single-quoted Python dict literal, which json.load
    # rejects. ast.literal_eval parses literals only, so unlike eval() it
    # cannot run code that happens to be in the file.
    js = ast.literal_eval(f.read())

headers = js.get('headers', '')



#content为文件的内容

scrapy记录日志的最新方法

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 4674 次浏览 • 2018-08-15 15:01 • 来自相关话题

旧的方法：from scrapy import log
log.msg("This is a warning", level=log.WARING)

在Spider中添加log

在spider中添加log的推荐方式是使用Spider的 log() 方法。该方法会自动在调用 scrapy.log.start() 时赋值 spider 参数。

其它的参数则直接传递给 msg() 方法

scrapy.log模块scrapy.log.start(logfile=None, loglevel=None, logstdout=None)启动log功能。该方法必须在记录任何信息之前被调用。否则调用前的信息将会丢失。

但是运行的时候出现警告：

[py.warnings] WARNING: E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py:14: ScrapyDeprecationWarning: log.msg has been deprecated, create a python logger and log through it instead
log.msg

原来官方已经不推荐使用log.msg了

最新的用法：# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
import logging
# from scrapy import log
class BiliSpider(scrapy.Spider):
name = 'ordinary' # 这个名字就是上面连接中那个启动应用的名字
allowed_domain = ["bilibili.com"]
start_urls = [
"https://www.bilibili.com/"
]

def parse(self, response):
logging.info('====================================================')
content = response.xpath("//div[@class='num-wrap']").extract_first()
logging.info(content)
logging.info('====================================================') 查看全部

旧的方法：

from scrapy import log

# Deprecated API, shown for comparison only. The level constant is WARNING.
log.msg("This is a warning", level=log.WARNING)

在Spider中添加log

在spider中添加log的推荐方式是使用Spider的 log() 方法。该方法会自动在调用 scrapy.log.start() 时赋值 spider 参数。

其它的参数则直接传递给 msg() 方法

scrapy.log模块scrapy.log.start(logfile=None, loglevel=None, logstdout=None)启动log功能。该方法必须在记录任何信息之前被调用。否则调用前的信息将会丢失。

但是运行的时候出现警告：



[py.warnings] WARNING: E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py:14: ScrapyDeprecationWarning: log.msg has been deprecated, create a python logger and log through it instead

  log.msg

原来官方已经不推荐使用log.msg了

最新的用法：

# -*- coding: utf-8 -*-

import scrapy

from scrapy_splash import SplashRequest

import logging

# from scrapy import log

class BiliSpider(scrapy.Spider):
    """Minimal spider showing logging through the standard ``logging`` module."""

    name = 'ordinary'  # the name used to launch this spider (see the link above)

    # Scrapy reads ``allowed_domains`` (plural); the singular spelling is
    # silently ignored, leaving the spider without a domain filter.
    allowed_domains = ["bilibili.com"]

    start_urls = [
        "https://www.bilibili.com/"
    ]

    def parse(self, response):
        """Log the first ``div.num-wrap`` element of the page between two rulers."""
        logging.info('====================================================')
        content = response.xpath("//div[@class='num-wrap']").extract_first()
        logging.info(content)
        logging.info('====================================================')

adbapi查询语句 -- python3

李魔佛发表了文章 • 0 个评论 • 4444 次浏览 • 2018-08-12 19:40 • 来自相关话题

Introduction to Twisted Enterprise
Abstract

Twisted is an asynchronous networking framework, but most database API implementations unfortunately have blocking interfaces -- for this reason, twisted.enterprise.adbapi was created. It is a non-blocking interface to the standardized DB-API 2.0 API, which allows you to access a number of different RDBMSes.

What you should already know

Python :-)
How to write a simple Twisted Server (see this tutorial to learn how)
Familiarity with using database interfaces (see the documentation for DBAPI 2.0 or this article by Andrew Kuchling)

Quick Overview

Twisted is an asynchronous framework. This means standard database modules cannot be used directly, as they typically work something like:# Create connection... db = dbmodule.connect('mydb', 'andrew', 'password') # ...which blocks for an unknown amount of time # Create a cursor cursor = db.cursor() # Do a query... resultset = cursor.query('SELECT * FROM table WHERE ...') # ...which could take a long time, perhaps even minutes.Those delays are unacceptable when using an asynchronous framework such as Twisted. For this reason, twisted provides twisted.enterprise.adbapi, an asynchronous wrapper for any DB-API 2.0-compliant module. It is currently best tested with the pyPgSQL module for PostgreSQL.

enterprise.adbapi will do blocking database operations in separate threads, which trigger callbacks in the originating thread when they complete. In the meantime, the original thread can continue doing normal work, like servicing other requests.

How do I use adbapi?

Rather than creating a database connection directly, use the adbapi.ConnectionPool class to manage a connections for you. This allows enterprise.adbapi to use multiple connections, one per thread. This is easy:# Using the "dbmodule" from the previous example, create a ConnectionPool from twisted.enterprise import adbapi dbpool = adbapi.ConnectionPool("dbmodule", 'mydb', 'andrew', 'password')Things to note about doing this:

There is no need to import dbmodule directly. You just pass the name to adbapi.ConnectionPool's constructor.
The parameters you would pass to dbmodule.connect are passed as extra arguments to adbapi.ConnectionPool's constructor. Keyword parameters work as well.
You may also control the size of the connection pool with the keyword parameters cp_min and cp_max. The default minimum and maximum values are 3 and 5.

So, now you need to be able to dispatch queries to your ConnectionPool. We do this by subclassing adbapi.Augmentation. Here's an example:class AgeDatabase(adbapi.Augmentation): """A simple example that can retrieve an age from the database""" def getAge(self, name): # Define the query sql = """SELECT Age FROM People WHERE name = ?""" # Run the query, and return a Deferred to the caller to add # callbacks to. return self.runQuery(sql, name) def gotAge(resultlist, name): """Callback for handling the result of the query""" age = resultlist[0][0] # First field of first record print "%s is %d years old" % (name, age) db = AgeDatabase(dbpool) # These will *not* block. Hooray! db.getAge("Andrew").addCallbacks(gotAge, db.operationError, callbackArgs=("Andrew",)) db.getAge("Glyph").addCallbacks(gotAge, db.operationError, callbackArgs=("Glyph",)) # Of course, nothing will happen until the reactor is started from twisted.internet import reactor reactor.run()This is straightforward, except perhaps for the return value of getAge. It returns a twisted.internet.defer.Deferred, which allows arbitrary callbacks to be called upon completion (or upon failure). More documentation on Deferred is available here.

Also worth noting is that this example assumes that dbmodule uses the qmarks paramstyle (see the DB-API specification). If your dbmodule uses a different paramstyle (e.g. pyformat) then use that. Twisted doesn't attempt to offer any sort of magic parameter munging -- runQuery(query, params, ...) maps directly onto cursor.execute(query, params, ...).

And that's it!

That's all you need to know to use a database from within Twisted. You probably should read the adbapi module's documentation to get an idea of the other functions it has, but hopefully this document presents the core ideas. 查看全部

Introduction to Twisted Enterprise
Abstract

Twisted is an asynchronous networking framework, but most database API implementations unfortunately have blocking interfaces -- for this reason, twisted.enterprise.adbapi was created. It is a non-blocking interface to the standardized DB-API 2.0 API, which allows you to access a number of different RDBMSes.

What you should already know

Python :-)
How to write a simple Twisted Server (see this tutorial to learn how)
Familiarity with using database interfaces (see the documentation for DBAPI 2.0 or this article by Andrew Kuchling)

Quick Overview

Twisted is an asynchronous framework. This means standard database modules cannot be used directly, as they typically work something like:# Create connection... db = dbmodule.connect('mydb', 'andrew', 'password') # ...which blocks for an unknown amount of time # Create a cursor cursor = db.cursor() # Do a query... resultset = cursor.query('SELECT * FROM table WHERE ...') # ...which could take a long time, perhaps even minutes.Those delays are unacceptable when using an asynchronous framework such as Twisted. For this reason, twisted provides twisted.enterprise.adbapi, an asynchronous wrapper for any DB-API 2.0-compliant module. It is currently best tested with the pyPgSQL module for PostgreSQL.

enterprise.adbapi will do blocking database operations in separate threads, which trigger callbacks in the originating thread when they complete. In the meantime, the original thread can continue doing normal work, like servicing other requests.

How do I use adbapi?

Rather than creating a database connection directly, use the adbapi.ConnectionPool class to manage a connections for you. This allows enterprise.adbapi to use multiple connections, one per thread. This is easy:# Using the "dbmodule" from the previous example, create a ConnectionPool from twisted.enterprise import adbapi dbpool = adbapi.ConnectionPool("dbmodule", 'mydb', 'andrew', 'password')Things to note about doing this:

There is no need to import dbmodule directly. You just pass the name to adbapi.ConnectionPool's constructor.
The parameters you would pass to dbmodule.connect are passed as extra arguments to adbapi.ConnectionPool's constructor. Keyword parameters work as well.
You may also control the size of the connection pool with the keyword parameters cp_min and cp_max. The default minimum and maximum values are 3 and 5.

So, now you need to be able to dispatch queries to your ConnectionPool. We do this by subclassing adbapi.Augmentation. Here's an example:class AgeDatabase(adbapi.Augmentation): """A simple example that can retrieve an age from the database""" def getAge(self, name): # Define the query sql = """SELECT Age FROM People WHERE name = ?""" # Run the query, and return a Deferred to the caller to add # callbacks to. return self.runQuery(sql, name) def gotAge(resultlist, name): """Callback for handling the result of the query""" age = resultlist[0][0] # First field of first record print "%s is %d years old" % (name, age) db = AgeDatabase(dbpool) # These will *not* block. Hooray! db.getAge("Andrew").addCallbacks(gotAge, db.operationError, callbackArgs=("Andrew",)) db.getAge("Glyph").addCallbacks(gotAge, db.operationError, callbackArgs=("Glyph",)) # Of course, nothing will happen until the reactor is started from twisted.internet import reactor reactor.run()This is straightforward, except perhaps for the return value of getAge. It returns a twisted.internet.defer.Deferred, which allows arbitrary callbacks to be called upon completion (or upon failure). More documentation on Deferred is available here.

Also worth noting is that this example assumes that dbmodule uses the qmarks paramstyle (see the DB-API specification). If your dbmodule uses a different paramstyle (e.g. pyformat) then use that. Twisted doesn't attempt to offer any sort of magic parameter munging -- runQuery(query, params, ...) maps directly onto cursor.execute(query, params, ...).

And that's it!

That's all you need to know to use a database from within Twisted. You probably should read the adbapi module's documentation to get an idea of the other functions it has, but hopefully this document presents the core ideas.

python判断身份证的合法性

李魔佛发表了文章 • 0 个评论 • 6685 次浏览 • 2018-08-10 13:56 • 来自相关话题

输入身份证号码, 判断18位身份证号码是否合法, 并查询信息(性别, 年龄, 所在地)

验证原理

将前面的身份证号码17位数分别乘以不同的系数, 从第一位到第十七位的系数分别为: 7 9 10 5 8 4 2 1 6 3 7 9 10 5 8 4 2
将这17位数字和系数相乘的结果相加.
用加出来和除以11, 看余数是多少?
余数只可能有<0 1 2 3 4 5 6 7 8 9 10>这11个数字, 其分别对应的最后一位身份证的号码为<1 0 X 9 8 7 6 5 4 3 2>.
通过上面得知如果余数是2，就会在身份证的第18位数字上出现罗马数字的Ⅹ。如果余数是10，身份证的最后一位号码就是2.

例如: 某男性的身份证号码是34052419800101001X, 我们要看看这个身份证是不是合法的身份证.

首先: 我们得出, 前17位的乘积和是189.

然后: 用189除以11得出的余数是2.

最后: 通过对应规则就可以知道余数2对应的校验码是X, 与该号码的最后一位相同. 所以, 这是一个合法的身份证号码.

代码如下：#!/bin/env python
# -*- coding: utf-8 -*-

from sys import platform
import json
import codecs

with codecs.open('data.json', 'r', encoding='utf8') as json_data:
city = json.load(json_data)

def check_valid(idcard):
# 城市编码, 出生日期, 归属地
city_id = idcard[:6]
print(city_id)
birth = idcard[6:14]

city_name = city.get(city_id,'Not found')

# 根据规则校验身份证是否符合规则
idcard_tuple = [int(num) for num in list(idcard[:-1])]
coefficient = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
sum_value = sum([idcard_tuple[i] * coefficient[i] for i in range(17)])

remainder = sum_value % 11

maptable = {0: '1', 1: '0', 2: 'x', 3: '9', 4: '8', 5: '7', 6: '6', 7: '5', 8: '4', 9: '3', 10: '2'}

if maptable[remainder] == idcard[17]:
print('<身份证合法>')
sex = int(idcard[16]) % 2
sex = '男' if sex == 1 else '女'
print('性别：' + sex)
birth_format="{}年{}月{}日".format(birth[:4],birth[4:6],birth[6:8])
print('出生日期:' + birth_format)
print('归属地:' + city_name)
return True
else:
print('<身份证不合法>')
return False

if __name__=='__main__':
idcard = str(input('请输入身份证号码：'))
check_valid(idcard)[/i]

github源码：https://github.com/Rockyzsu/IdentityCheck
原创文章，转载请注明
http://30daydo.com/article/340
查看全部

输入身份证号码, 判断18位身份证号码是否合法, 并查询信息(性别, 年龄, 所在地)

验证原理

将前面的身份证号码17位数分别乘以不同的系数, 从第一位到第十七位的系数分别为: 7 9 10 5 8 4 2 1 6 3 7 9 10 5 8 4 2
将这17位数字和系数相乘的结果相加.
用加出来和除以11, 看余数是多少?
余数只可能有<0 1 2 3 4 5 6 7 8 9 10>这11个数字, 其分别对应的最后一位身份证的号码为<1 0 X 9 8 7 6 5 4 3 2>.
通过上面得知如果余数是2，就会在身份证的第18位数字上出现罗马数字的Ⅹ。如果余数是10，身份证的最后一位号码就是2.

例如: 某男性的身份证号码是34052419800101001X, 我们要看看这个身份证是不是合法的身份证.

首先: 我们得出, 前17位的乘积和是189.

然后: 用189除以11得出的余数是2.

最后: 通过对应规则就可以知道余数2对应的校验码是X, 与该号码的最后一位相同. 所以, 这是一个合法的身份证号码.

代码如下：

#!/bin/env python

# -*- coding: utf-8 -*-



from sys import platform

import json

import codecs



# Load data.json once at import time into the module-level name ``city``,
# which check_valid() queries with the first six characters of the ID number.
with codecs.open('data.json', 'r', encoding='utf8') as json_data:

    city = json.load(json_data)



def check_valid(idcard):
    """Validate an 18-character ID number and print what it encodes.

    The first 17 digits are multiplied by fixed weights and summed; the sum
    modulo 11 selects the expected check character, which is compared with
    the 18th character (``X`` is accepted in either case).

    Args:
        idcard: the ID number as a string.

    Returns:
        True when the number is well formed and the check character matches
        (sex, birth date and region are printed), otherwise False.
    """
    # The checksum is only defined for 17 ASCII digits plus one check
    # character; anything else is rejected instead of raising.
    if len(idcard) != 18 or any(ch not in '0123456789' for ch in idcard[:17]):
        print('<身份证不合法>')
        return False

    coefficient = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
    # Expected check character, indexed by (weighted sum % 11).
    check_chars = '10X98765432'

    sum_value = sum(int(digit) * weight for digit, weight in zip(idcard, coefficient))
    if check_chars[sum_value % 11] != idcard[17].upper():
        print('<身份证不合法>')
        return False

    # Region code (first 6 digits) and birth date (next 8 digits, YYYYMMDD).
    city_name = city.get(idcard[:6], 'Not found')
    birth = idcard[6:14]

    print('<身份证合法>')
    # The 17th digit encodes the sex: odd is male, even is female.
    sex = '男' if int(idcard[16]) % 2 == 1 else '女'
    print('性别：' + sex)
    birth_format = "{}年{}月{}日".format(birth[:4], birth[4:6], birth[6:8])
    print('出生日期:' + birth_format)
    print('归属地:' + city_name)
    return True





if __name__ == '__main__':
    # Strip stray whitespace from the typed number before validating it.
    idcard = input('请输入身份证号码：').strip()
    check_valid(idcard)

github源码：https://github.com/Rockyzsu/IdentityCheck
原创文章，转载请注明
http://30daydo.com/article/340

想写一个爬取开奖数据并预测下一期的py

贡献

python爬虫 • 李魔佛回复了问题 • 2 人关注 • 1 个回复 • 6262 次浏览 • 2018-08-10 00:22 • 来自相关话题

python sqlalchemy ORM 添加注释

李魔佛发表了文章 • 0 个评论 • 4398 次浏览 • 2018-06-25 16:17 • 来自相关话题

需要更新sqlalchemy到最新版本，旧版本会不支持。

在定义ORM对象的时候，
class CreditRecord(Base):
__tablename__ = 'tb_PersonPunishment'

id = Column(Integer, primary_key=True, autoincrement=True)
name = Column(String(180),comment='名字')
添加一个comment参数即可。

查看全部

需要更新sqlalchemy到最新版本，旧版本会不支持。

在定义ORM对象的时候，

class CreditRecord(Base):
    """ORM model mapped to the ``tb_PersonPunishment`` table."""

    __tablename__ = 'tb_PersonPunishment'



    # Auto-incrementing integer primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)

    # ``comment=`` attaches a comment to the column definition.
    # NOTE(review): needs a SQLAlchemy release that supports it -- confirm.
    name = Column(String(180),comment='名字')

添加一个comment参数即可。

windows 7 python3 安装MySQLdb 库

李魔佛发表了文章 • 0 个评论 • 3254 次浏览 • 2018-06-20 18:04 • 来自相关话题

python3下没有MySQLdb的库，可以直接到这里下载mysqlclient库来替代。https://www.lfd.uci.edu/~gohlke/pythonlibs/#mysqlclient

python3中定义抽象类的方法在python2中不兼容

李魔佛发表了文章 • 0 个评论 • 4997 次浏览 • 2018-06-10 20:54 • 来自相关话题

在python3中新式的定义抽象类的方法如下：from abc import ABCMeta,abstractmethod

class Server(metaclass=ABCMeta):

@abstractmethod
def __init__(self):
pass

def __str__(self):
return self.name

@abstractmethod
def boot(self):
pass

@abstractmethod
def kill(self):
pass

但是这个方法在python2中会提示语法错误。

在python2中只能像下面这种方式定义抽象类：
from abc import ABCMeta,abstractmethod

class Server(object):
__metaclass__=ABCMeta
@abstractmethod
def __init__(self):
pass

def __str__(self):
return self.name

@abstractmethod
def boot(self):
pass

@abstractmethod
def kill(self):
pass
这种方式在python2中可以正常运行；在python3中虽然不会报语法错误，但__metaclass__属性会被忽略，抽象方法不会被强制检查（Server可以被直接实例化）。反过来，python3的写法只能在python3中使用，无法在python2中运行。

原创地址：
http://30daydo.com/article/326
欢迎转载，请注明出处。查看全部

在python3中新式的定义抽象类的方法如下：

from abc import ABCMeta,abstractmethod



class Server(metaclass=ABCMeta):
    """Abstract base class declared with the Python 3 metaclass syntax.

    ``__init__``, ``boot`` and ``kill`` are abstract, so only subclasses
    that override all three can be instantiated. ``__str__`` expects the
    subclass to have set ``self.name``.
    """

    @abstractmethod
    def __init__(self):
        """Abstract initialiser; subclasses must override it."""

    def __str__(self):
        """Return the instance's ``name`` attribute."""
        label = self.name
        return label

    @abstractmethod
    def boot(self):
        """Abstract hook; subclasses must override it."""

    @abstractmethod
    def kill(self):
        """Abstract hook; subclasses must override it."""

但是这个方法在python2中会提示语法错误。

在python2中只能像下面这种方式定义抽象类：

from abc import ABCMeta,abstractmethod



class Server(object):
    """Abstract base class declared with the ``__metaclass__`` attribute.

    NOTE(review): ``__metaclass__`` is only honoured by Python 2. Python 3
    treats it as an ordinary class attribute, so the abstract methods are
    not enforced there -- confirm before relying on it.
    """

    __metaclass__ = ABCMeta

    @abstractmethod
    def __init__(self):
        """Abstract initialiser; subclasses must override it."""

    def __str__(self):
        """Return the instance's ``name`` attribute."""
        label = self.name
        return label

    @abstractmethod
    def boot(self):
        """Abstract hook; subclasses must override it."""

    @abstractmethod
    def kill(self):
        """Abstract hook; subclasses must override it."""

这种方式在python2中可以正常运行；在python3中虽然不会报语法错误，但__metaclass__属性会被忽略，抽象方法不会被强制检查（Server可以被直接实例化）。反过来，python3的写法只能在python3中使用，无法在python2中运行。

原创地址：
http://30daydo.com/article/326
欢迎转载，请注明出处。

numpy数组四舍五入

李魔佛发表了文章 • 0 个评论 • 14416 次浏览 • 2018-05-21 09:17 • 来自相关话题

numpy.around(nlist, number)
传入一个np的数组和需要保留的位数作为参数

例子：import numpy as np
x = np.arange(10)
x=x/77.0
print x
输出结果为：[0. 0.01298701 0.02597403 0.03896104 0.05194805 0.06493506
0.07792208 0.09090909 0.1038961 0.11688312] np.around(x, 3) #保存为3位小数
array([0. , 0.013, 0.026, 0.039, 0.052, 0.065, 0.078, 0.091, 0.104, 0.117]) 查看全部

numpy.around(nlist, number)
传入一个np的数组和需要保留的位数作为参数

例子：

import numpy as np

# Ten sample values 0/77 ... 9/77 to demonstrate rounding on.
x = np.arange(10)
x = x / 77.0

# The call form of print works on both Python 2 and Python 3.
print(x)

输出结果为：

[0.         0.01298701 0.02597403 0.03896104 0.05194805 0.06493506

 0.07792208 0.09090909 0.1038961  0.11688312]

np.around(x, 3)   #保存为3位小数

array([0. , 0.013, 0.026, 0.039, 0.052, 0.065, 0.078, 0.091, 0.104, 0.117])

pandas中diff控制移动方向，向上移动

李魔佛发表了文章 • 0 个评论 • 6343 次浏览 • 2018-04-25 20:39 • 来自相关话题

初始化一个dataframe

然后使用默认的diff（periods=1）

行的索引不变，数据被往下拉了一行。当然你也可以使用periods=2 ，那么数据整体会往下移2格。

如果要往上移动，只要把periods的值设为负的就可以了。

查看全部

初始化一个dataframe

然后使用默认的diff（periods=1）

行的索引不变，数据被往下拉了一行。当然你也可以使用periods=2 ，那么数据整体会往下移2格。

如果要往上移动，只要把periods的值设为负的就可以了。

python安装mpl_finance [finance模块已经从matplotlib2.0.2中脱离出来]

李魔佛发表了文章 • 0 个评论 • 19674 次浏览 • 2018-04-23 23:17 • 来自相关话题

最新的matplotlib中已经把其中的finance库脱离出来，目前还没有放入PIP的仓库中，所以使用pip install mpl_finance会提示找不到所需要的库.

解决办法：
到官方github中下载源码，然后在本地安装即可。目前的mpl_finance的版本还是dev版，不过用起来也没什么大问题。

git clone git@github.com:matplotlib/mpl_finance.git

等待下载后，进入该目录， sudo python setup.py install

OK
查看全部

最新的matplotlib中已经把其中的finance库脱离出来，目前还没有放入PIP的仓库中，所以使用pip install mpl_finance会提示找不到所需要的库.

解决办法：
到官方github中下载源码，然后在本地安装即可。目前的mpl_finance的版本还是dev版，不过用起来也没什么大问题。

git clone git@github.com:matplotlib/mpl_finance.git

等待下载后，进入该目录， sudo python setup.py install

OK

python取出两个同样表结构的MySQL数据库中不同的行

李魔佛发表了文章 • 0 个评论 • 3891 次浏览 • 2018-04-14 11:11 • 来自相关话题

因为平时有本地数据库和远程数据库，本地的时候是离线的时候看的。有时候因为修改代码的缘故，导致远程数据和本地数据有不一样的地方，那么可以使用python+pandas很简单的筛选出不同的行。

df_new[~(df_new['URL'].isin(df_old['URL'].values))]

其中df_old 为本地的数据库读取的dataframe数据，而df_new 为远程的数据，通过判断唯一的key URL的值来筛选出不同的数据行查看全部

因为平时有本地数据库和远程数据库，本地的时候是离线的时候看的。有时候因为修改代码的缘故，导致远程数据和本地数据有不一样的地方，那么可以使用python+pandas很简单的筛选出不同的行。

df_new[~(df_new['URL'].isin(df_old['URL'].values))]

其中df_old 为本地的数据库读取的dataframe数据，而df_new 为远程的数据，通过判断唯一的key URL的值来筛选出不同的数据行

RuntimeWarning: More than 20 figures have been opened.

李魔佛回复了问题 • 1 人关注 • 1 个回复 • 14538 次浏览 • 2018-04-12 12:40 • 来自相关话题

pycharm Failed with error: Could not read from remote repository. git push 操作

李魔佛回复了问题 • 1 人关注 • 1 个回复 • 6432 次浏览 • 2018-03-14 00:31 • 来自相关话题

urlparse中urldefrag函数的用法

李魔佛发表了文章 • 0 个评论 • 3406 次浏览 • 2018-03-11 17:59 • 来自相关话题

urlparse.urldefrag(url)¶

If url contains a fragment identifier, returns a modified version of url with no fragment identifier, and the fragment identifier as a separate string. If there is no fragment identifier in url, returns url unmodified and an empty string.

官网的解释如上，作用就是把url中的fragment标识符去掉。What ？
fragment标识符是url中#号的部分。
比如 http://www.example.com/index.html#print

#代表网页中的一个位置。其右面的字符，就是该位置的标识符。

就代表网页index.html的print位置。浏览器读取这个URL后，会自动将print位置滚动至可视区域。

为网页位置指定标识符，有两个方法。一是使用锚点，比如<a name="print"></a>，二是使用id属性，比如<div id="print" >。

所以：
url='http://www.example.com/index.html#print'
# urldefrag() returns a (url, fragment) pair; keep the fragment-free URL.
url, fragment = urlparse.urldefrag(url)
那么返回的url是http://www.example.com/index.html，因为这两个页面实际是同一个url，在爬虫程序中可以用来过滤同一个页面查看全部

urlparse.urldefrag(url)¶

If url contains a fragment identifier, returns a modified version of url with no fragment identifier, and the fragment identifier as a separate string. If there is no fragment identifier in url, returns url unmodified and an empty string.

官网的解释如上，作用就是把url中的fragment标识符去掉。What ？
fragment标识符是url中#号的部分。
比如 http://www.example.com/index.html#print

#代表网页中的一个位置。其右面的字符，就是该位置的标识符。

就代表网页index.html的print位置。浏览器读取这个URL后，会自动将print位置滚动至可视区域。

为网页位置指定标识符，有两个方法。一是使用锚点，比如<a name="print"></a>，二是使用id属性，比如<div id="print" >。

所以：
url='http://www.example.com/index.html#print'
# urldefrag() returns a (url, fragment) pair; keep the fragment-free URL.
url, fragment = urlparse.urldefrag(url)
那么返回的url是http://www.example.com/index.html，因为这两个页面实际是同一个url，在爬虫程序中可以用来过滤同一个页面

strptime修改默认年份，datetime - strptime默认值为 1900

李魔佛发表了文章 • 0 个评论 • 4536 次浏览 • 2018-03-07 08:42 • 来自相关话题

比如
s='03-06 18:36'
# No year in the format, so strptime falls back to its default year, 1900.
news_time_f=datetime.datetime.strptime(s,'%m-%d %H:%M')
print(news_time_f)

返回来的结果是datetime类型，但是年份是1900年。
1900-03-06 18:36:00

有两种办法：
1. 在日期格式前人为添加年份
# Prepend the year so the text matches the '%Y-' prefix of the format.
news_time_f=datetime.datetime.strptime('2018-'+s,'%Y-%m-%d %H:%M')

2.使用自带的replace函数
s='03-06 18:36'
news_time_f=datetime.datetime.strptime(s,'%m-%d %H:%M')
# replace() returns a new datetime with only the year changed.
news_time_f=news_time_f.replace(year=2018)

上面两种方法都可以把03-06 18:36
转换为2018-03-06 18:36:00的datetime类型查看全部

比如
s='03-06 18:36'
# No year in the format, so strptime falls back to its default year, 1900.
news_time_f=datetime.datetime.strptime(s,'%m-%d %H:%M')
print(news_time_f)

返回来的结果是datetime类型，但是年份是1900年。
1900-03-06 18:36:00

有两种办法：
1. 在日期格式前人为添加年份
# Prepend the year so the text matches the '%Y-' prefix of the format.
news_time_f=datetime.datetime.strptime('2018-'+s,'%Y-%m-%d %H:%M')

2.使用自带的replace函数
s='03-06 18:36'
news_time_f=datetime.datetime.strptime(s,'%m-%d %H:%M')
# replace() returns a new datetime with only the year changed.
news_time_f=news_time_f.replace(year=2018)

上面两种方法都可以把03-06 18:36
转换为2018-03-06 18:36:00的datetime类型

python预测下一期双色球号码【机器学习】

李魔佛发表了文章 • 2 个评论 • 16986 次浏览 • 2018-02-19 10:16 • 来自相关话题

Task：
1. 在福彩官网抓取所有的历史双色球数据。
2. 使用历史数据进行机器学习。

Part1 数据抓取

main.pyimport re
from bs4 import BeautifulSoup
import urllib2
from mylog import MyLog as mylog
from save2excel import SavaBallDate

class DoubleColorBallItem(object):
date = None
order = None
red1 = None
red2 = None
red3 = None
red4 = None
red5 = None
red6 = None
blue = None
money = None
firstPrize = None
secondPrize = None

class GetDoubleColorBallNumber(object):
'''这个类用于获取双色球中奖号码，返回一个txt文件
'''
def __init__(self):
self.urls =
self.log = mylog()
self.getUrls()
self.items = self.spider(self.urls)
self.pipelines(self.items)
self.log.info('beging save data to excel \r\n')
SavaBallDate(self.items)
self.log.info('save data to excel end ...\r\n')

def getUrls(self):
'''获取数据来源网页
'''
URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
htmlContent = self.getResponseContent(URL)
soup = BeautifulSoup(htmlContent, 'lxml')
tag = soup.find_all(re.compile('p'))[-1]
pages = tag.strong.get_text()
for i in xrange(1, int(pages)+1):
url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
self.urls.append(url)
self.log.info(u'添加URL:%s 到URLS \r\n' %url)

def getResponseContent(self, url):
'''这里单独使用一个函数返回页面返回值，是为了后期方便的加入proxy和headers等
'''
try:
response = urllib2.urlopen(url.encode('utf8'))
except:
self.log.error(u'Python 返回URL:%s 数据失败 \r\n' %url)
else:
self.log.info(u'Python 返回URUL:%s 数据成功 \r\n' %url)
return response.read()

def spider(self,urls):
'''这个函数的作用是从获取的数据中过滤得到中奖信息
'''
items =
for url in urls:
htmlContent = self.getResponseContent(url)
soup = BeautifulSoup(htmlContent, 'lxml')
tags = soup.find_all('tr', attrs={})
for tag in tags:
if tag.find('em'):
item = DoubleColorBallItem()
tagTd = tag.find_all('td')
item.date = tagTd[0].get_text()
item.order = tagTd[1].get_text()
tagEm = tagTd[2].find_all('em')
item.red1 = tagEm[0].get_text()
item.red2 = tagEm[1].get_text()
item.red3 = tagEm[2].get_text()
item.red4 = tagEm[3].get_text()
item.red5 = tagEm[4].get_text()
item.red6 = tagEm[5].get_text()
item.blue = tagEm[6].get_text()
item.money = tagTd[3].find('strong').get_text()
item.firstPrize = tagTd[4].find('strong').get_text()
item.secondPrize = tagTd[5].find('strong').get_text()
items.append(item)
self.log.info(u'获取日期为:%s 的数据成功' %(item.date))
return items

def pipelines(self,items):
fileName = u'双色球.txt'.encode('GBK')
with open(fileName, 'w') as fp:
for item in items:
fp.write('%s %s \t %s %s %s %s %s %s %s \t %s \t %s %s \n'
%(item.date,item.order,item.red1,item.red2,item.red3,item.red4,item.red5,item.red6,item.blue,item.money,item.firstPrize,item.secondPrize))
self.log.info(u'将日期为:%s 的数据存入"%s"...' %(item.date, fileName.decode('GBK')))

if __name__ == '__main__':
GDCBN = GetDoubleColorBallNumber()

from mylog import MyLog as mylog
from save2excel import SavaBallDate
这两个模块是用来显示log和保存为excel数据的。import logging
import getpass
import sys

#### 定义MyLog类
class MyLog(object):
#### 类MyLog的构造函数
def __init__(self):
self.user = getpass.getuser()
self.logger = logging.getLogger(self.user)
self.logger.setLevel(logging.DEBUG)

#### 日志文件名
self.logFile = sys.argv[0][0:-3] + '.log'
self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')

#### 日志显示到屏幕上并输出到日志文件内
self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
self.logHand.setFormatter(self.formatter)
self.logHand.setLevel(logging.DEBUG)

self.logHandSt = logging.StreamHandler()
self.logHandSt.setFormatter(self.formatter)
self.logHandSt.setLevel(logging.DEBUG)

self.logger.addHandler(self.logHand)
self.logger.addHandler(self.logHandSt)

#### 日志的5个级别对应以下的5个函数
def debug(self,msg):
self.logger.debug(msg)

def info(self,msg):
self.logger.info(msg)

def warn(self,msg):
self.logger.warn(msg)

def error(self,msg):
self.logger.error(msg)

def critical(self,msg):
self.logger.critical(msg)

if __name__ == '__main__':
mylog = MyLog()
mylog.debug(u"I'm debug 测试中文")
mylog.info("I'm info")
mylog.warn("I'm warn")
mylog.error(u"I'm error 测试中文")
mylog.critical("I'm critical")

import xlwt
class SavaBallDate(object):
def __init__(self, items):
self.items = items
self.run(self.items)

def run(self,items):
fileName = u'双色球.xls'.encode('GBK')
book = xlwt.Workbook(encoding='utf8')
sheet=book.add_sheet('ball', cell_overwrite_ok=True)
sheet.write(0, 0, u'开奖日期'.encode('utf8'))
sheet.write(0, 1, u'期号'.encode('utf8'))
sheet.write(0, 2, u'红1'.encode('utf8'))
sheet.write(0, 3, u'红2'.encode('utf8'))
sheet.write(0, 4, u'红3'.encode('utf8'))
sheet.write(0, 5, u'红4'.encode('utf8'))
sheet.write(0, 6, u'红5'.encode('utf8'))
sheet.write(0, 7, u'红6'.encode('utf8'))
sheet.write(0, 8, u'蓝'.encode('utf8'))
sheet.write(0, 9, u'销售金额'.encode('utf8'))
sheet.write(0, 10, u'一等奖'.encode('utf8'))
sheet.write(0, 11, u'二等奖'.encode('utf8'))
i = 1
while i <= len(items):
item = items[i-1]
sheet.write(i, 0, item.date)
sheet.write(i, 1, item.order)
sheet.write(i, 2, item.red1)
sheet.write(i, 3, item.red2)
sheet.write(i, 4, item.red3)
sheet.write(i, 5, item.red4)
sheet.write(i, 6, item.red5)
sheet.write(i, 7, item.red6)
sheet.write(i, 8, item.blue)
sheet.write(i, 9, item.money)
sheet.write(i, 10, item.firstPrize)
sheet.write(i, 11, item.secondPrize)
i += 1
book.save(fileName)

if __name__ == '__main__':
pass

运行脚本后，会在本地生成一个excel文件，保存为以下的格式：

第一步完成。
Part2 机器学习进行预测
2018-02-19
待续，后面继续更新
原文链接：
http://30daydo.com/article/277
查看全部

Task：
1. 在福彩官网抓取所有的历史双色球数据。
2. 使用历史数据进行机器学习。

Part1 数据抓取

main.py

import re

from bs4 import BeautifulSoup

import urllib2

from mylog import MyLog as mylog

from save2excel import SavaBallDate



class DoubleColorBallItem(object):
    """Plain record for one lottery draw; every field defaults to None."""

    # Date and order number of the draw.
    date = order = None
    # The six red balls and the single blue ball.
    red1 = red2 = red3 = red4 = red5 = red6 = None
    blue = None
    # Money figure plus the first- and second-prize figures.
    money = firstPrize = secondPrize = None



class GetDoubleColorBallNumber(object):
    """Scrape the double-colour-ball draw results and save them.

    Instantiating the class runs the whole job: collect the list-page URLs,
    parse every page into DoubleColorBallItem records, write them to a text
    file and hand them to SavaBallDate.
    """

    def __init__(self):
        # Must start as an empty list: getUrls() appends to it.
        self.urls = []
        self.log = mylog()
        self.getUrls()
        self.items = self.spider(self.urls)
        self.pipelines(self.items)
        self.log.info('beging save data to excel \r\n')
        SavaBallDate(self.items)
        self.log.info('save data to excel end ...\r\n')

    def getUrls(self):
        """Fill ``self.urls`` with one list-page URL per result page.

        The page count is read from the ``<strong>`` element inside the last
        tag matched by the pattern ``p`` on the first list page.
        """
        URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
        htmlContent = self.getResponseContent(URL)
        if htmlContent is None:
            # The download failed and was already logged; nothing to parse.
            return
        soup = BeautifulSoup(htmlContent, 'lxml')
        tag = soup.find_all(re.compile('p'))[-1]
        pages = tag.strong.get_text()
        # range() behaves like xrange() here and also exists on Python 3.
        for i in range(1, int(pages) + 1):
            url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
            self.urls.append(url)
            self.log.info(u'添加URL:%s 到URLS \r\n' %url)

    def getResponseContent(self, url):
        """Download *url* and return the response body, or None on failure.

        Kept as a separate method so that proxies, headers and the like can
        be added later in one place.
        """
        try:
            response = urllib2.urlopen(url.encode('utf8'))
            content = response.read()
        except Exception:
            # Best effort: log and report the failure with None. Unlike a
            # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
            self.log.error(u'Python 返回URL:%s  数据失败  \r\n' %url)
            return None
        self.log.info(u'Python 返回URUL:%s  数据成功 \r\n' %url)
        return content

    def spider(self, urls):
        """Parse every page in *urls* and return the draw records found.

        Only ``<tr>`` rows containing an ``<em>`` element are treated as draw
        rows. Pages that could not be downloaded are skipped.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            for tag in soup.find_all('tr', attrs={}):
                if not tag.find('em'):
                    continue
                item = DoubleColorBallItem()
                tagTd = tag.find_all('td')
                item.date = tagTd[0].get_text()
                item.order = tagTd[1].get_text()
                # The third cell holds seven <em> elements: six red, one blue.
                tagEm = tagTd[2].find_all('em')
                for idx in range(6):
                    setattr(item, 'red%d' % (idx + 1), tagEm[idx].get_text())
                item.blue = tagEm[6].get_text()
                item.money = tagTd[3].find('strong').get_text()
                item.firstPrize = tagTd[4].find('strong').get_text()
                item.secondPrize = tagTd[5].find('strong').get_text()
                items.append(item)
                self.log.info(u'获取日期为:%s 的数据成功' %(item.date))
        return items

    def pipelines(self, items):
        """Write one line per record to the text file and log each write."""
        fileName = u'双色球.txt'.encode('GBK')
        with open(fileName, 'w') as fp:
            for item in items:
                fp.write('%s %s \t %s %s %s %s %s %s  %s \t %s \t %s %s \n'
                      %(item.date,item.order,item.red1,item.red2,item.red3,item.red4,item.red5,item.red6,item.blue,item.money,item.firstPrize,item.secondPrize))
                self.log.info(u'将日期为:%s 的数据存入"%s"...' %(item.date, fileName.decode('GBK')))

                    



# Script entry point: constructing the object runs the whole job, because
# its __init__ calls getUrls(), spider() and pipelines() itself.
if __name__ == '__main__':

    GDCBN = GetDoubleColorBallNumber()

from mylog import MyLog as mylog
from save2excel import SavaBallDate
这两个模块是用来显示log和保存为excel数据的。

import logging

import getpass

import sys





#### 定义MyLog类

class MyLog(object):
    """Small logging facade that writes to a log file and to the console.

    The underlying logger is named after the current OS user. The log file
    name is derived from ``sys.argv[0]`` by replacing its extension with
    ``.log``.
    """

    def __init__(self):
        import os  # local import: only needed to derive the log file name

        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)

        # splitext() also copes with script names that do not end in a
        # three-character extension.
        self.logFile = os.path.splitext(sys.argv[0])[0] + '.log'
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')

        # getLogger() returns the same logger object for the same name, so a
        # second MyLog() must reuse the handlers attached by the first one;
        # otherwise every message would be written once per instance.
        self.logHand = self._existing_handler(logging.FileHandler)
        if self.logHand is None:
            self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
            self.logHand.setFormatter(self.formatter)
            self.logHand.setLevel(logging.DEBUG)
            self.logger.addHandler(self.logHand)

        self.logHandSt = self._existing_handler(logging.StreamHandler)
        if self.logHandSt is None:
            self.logHandSt = logging.StreamHandler()
            self.logHandSt.setFormatter(self.formatter)
            self.logHandSt.setLevel(logging.DEBUG)
            self.logger.addHandler(self.logHandSt)

    def _existing_handler(self, handler_type):
        """Return the logger's handler of exactly *handler_type*, or None."""
        for handler in self.logger.handlers:
            # Exact type match on purpose: FileHandler subclasses
            # StreamHandler, so isinstance() would confuse the two.
            if type(handler) is handler_type:
                return handler
        return None

    # One thin wrapper per logging level.
    def debug(self, msg):
        """Log *msg* at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log *msg* at INFO level."""
        self.logger.info(msg)

    def warn(self, msg):
        """Log *msg* at WARNING level."""
        # Logger.warn() is a deprecated alias; warning() is the supported name.
        self.logger.warning(msg)

    def error(self, msg):
        """Log *msg* at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log *msg* at CRITICAL level."""
        self.logger.critical(msg)



if __name__ == '__main__':
    # Smoke test: send one message through each of the five level methods.
    mylog = MyLog()
    for emit, text in (
        (mylog.debug, u"I'm debug 测试中文"),
        (mylog.info, "I'm info"),
        (mylog.warn, "I'm warn"),
        (mylog.error, u"I'm error 测试中文"),
        (mylog.critical, "I'm critical"),
    ):
        emit(text)

import xlwt

class SavaBallDate(object):
    """Write draw records to an .xls workbook as soon as it is constructed."""

    def __init__(self, items):
        self.items = items
        self.run(self.items)

    def run(self, items):
        """Create the 'ball' sheet: a header row, then one row per item."""
        fileName = u'双色球.xls'.encode('GBK')
        book = xlwt.Workbook(encoding='utf8')
        sheet = book.add_sheet('ball', cell_overwrite_ok=True)

        # Row 0 holds the column titles, written left to right.
        titles = (u'开奖日期', u'期号', u'红1', u'红2', u'红3', u'红4',
                  u'红5', u'红6', u'蓝', u'销售金额', u'一等奖', u'二等奖')
        for col, title in enumerate(titles):
            sheet.write(0, col, title.encode('utf8'))

        # Item attributes in the same column order as the titles above.
        fields = ('date', 'order', 'red1', 'red2', 'red3', 'red4',
                  'red5', 'red6', 'blue', 'money', 'firstPrize', 'secondPrize')
        row = 1
        for item in items:
            for col, field in enumerate(fields):
                sheet.write(row, col, getattr(item, field))
            row += 1
        book.save(fileName)

        





# Running this file directly does nothing.
if __name__ == '__main__':

    pass

运行脚本后，会在本地生成一个excel文件，保存为以下的格式：

第一步完成。
Part2 机器学习进行预测
2018-02-19
待续，后面继续更新
原文链接：
http://30daydo.com/article/277

TypeError: the dtype datetime64 is not supported for parsing

李魔佛回复了问题 • 1 人关注 • 1 个回复 • 11139 次浏览 • 2018-02-05 02:44 • 来自相关话题

Message: invalid selector: Compound class names not permitted

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 4253 次浏览 • 2018-01-30 00:59 • 来自相关话题

使用selenium的时候如果使用了
driver.find_element_by_class_name("content")
使用class名字来查找元素的话，如果传入的是带空格的复合class名（例如"login-tab login-tab-r"），就会出现
Message: invalid selector: Compound class names not permitted
这个错误。

比如京东的登录页面中：
<div id="content">
<div class="login-wrap">
<div class="w">
<div class="login-form">
<div class="login-tab login-tab-l">
<a href="javascript:void(0)" clstag="pageclick|keycount|201607144|1"> 扫码登录</a>
</div>
<div class="login-tab login-tab-r">
<a href="javascript:void(0)" clstag="pageclick|keycount|201607144|2">账户登录</a>
</div>
<div class="login-box">
<div class="mt tab-h">
</div>
<div class="msg-wrap">
<div class="msg-error hide"><b></b></div>
</div>

我要找的是<div class="login-tab login-tab-r">

那么应该使用css选择器：

browser.find_element_by_css_selector('div.login-tab.login-tab-r')
查看全部

使用selenium的时候如果使用了
driver.find_element_by_class_name("content")
使用class名字来查找元素的话，如果传入的是带空格的复合class名（例如"login-tab login-tab-r"），就会出现
Message: invalid selector: Compound class names not permitted
这个错误。

比如京东的登录页面中：

<div id="content">

    <div class="login-wrap">

		<div class="w">

            <div class="login-form">

                <div class="login-tab login-tab-l">

                    <a href="javascript:void(0)" clstag="pageclick|keycount|201607144|1"> 扫码登录</a>

                </div>

                <div class="login-tab login-tab-r">

                    <a href="javascript:void(0)" clstag="pageclick|keycount|201607144|2">账户登录</a>

                </div>

                <div class="login-box">

                    <div class="mt tab-h">

                    </div>

                    <div class="msg-wrap">

						                        <div class="msg-error hide"><b></b></div>

                    </div>

我要找的是<div class="login-tab login-tab-r">

那么应该使用css选择器：

browser.find_element_by_css_selector('div.login-tab.login-tab-r')

Pycharm控制台窗口怎样可以显示不同程序的运行结果

李魔佛发表了文章 • 0 个评论 • 10898 次浏览 • 2018-01-27 20:31 • 来自相关话题

默认情况下，每次运行会把之前的那个结果给清理掉。有时候运行多个程序想对比结果，不太方便。
可以在pycharm的控制台那里点击右键，在弹出的菜单中，选择“Pin Tab”，那么当前的控制台就不会被清掉啦，它可以一直保留着，直到你自己手动去关闭它。

运行python requests/urllib2/urllib3 需要sudo/root权限，为什么？

李魔佛回复了问题 • 1 人关注 • 1 个回复 • 7464 次浏览 • 2018-01-10 23:36 • 来自相关话题

python模拟登录vexx.pro 获取你的总资产/币值和其他个人信息

python爬虫 • 李魔佛发表了文章 • 0 个评论 • 4449 次浏览 • 2018-01-10 03:22 • 来自相关话题

因为每次登录vexx.pro，第一次输入正常的验证码都会说你是错误的，搞得每次都要输入2次验证码，所以为了节省点时间，就写了个模拟登录来自动获取自己的账户信息的python程序。

# -*-coding=utf-8-*-

import requests
session = requests.Session()
user = ''
password = ''

def getCode():
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
url = 'http://vexx.pro/verify/code.html'
s = session.get(url=url, headers=headers)

with open('code.png', 'wb') as f:
f.write(s.content)

code = raw_input('input the code: ')
print 'code is ', code

login_url = 'http://vexx.pro/login/up_login.html'
post_data = {
'moble': user,
'mobles': '+86',
'password': password,
'verify': code,
'login_token': ''}

login_s = session.post(url=login_url, headers=header, data=post_data)
print login_s.status_code

zzc_url = 'http://vexx.pro/ajax/check_zzc/'
zzc_s = session.get(url=zzc_url, headers=headers)
print zzc_s.text

def main():
getCode()

if __name__ == '__main__':
main()

把自己的用户名和密码填上去，中途输入一次验证码。
可以把session保存到本地，然后下一次就可以不用再输入密码。

后记：经过几个月后，这个网站被证实是一个圈钱跑路的网站，目前已经无法正常登陆了。希望大家不要再上当了
原创地址：http://30daydo.com/article/263
转载请注明出处。查看全部

因为每次登录vexx.pro，第一次输入正常的验证码都会说你是错误的，搞得每次都要输入2次验证码，所以为了节省点时间，就写了个模拟登录来自动获取自己的账户信息的python程序。

# -*-coding=utf-8-*-



import requests

# One Session shared by every request in getCode().
# NOTE(review): presumably so the captcha cookie is sent with the login -- confirm.
session = requests.Session()

# Account credentials; left empty here and meant to be filled in before running.
user = ''

password = ''



def getCode():
    """Log in to vexx.pro interactively and print the check_zzc response.

    Saves the captcha image to ``code.png``, asks the user to type the code
    shown in it, posts the login form and prints the login status code, then
    fetches the ``check_zzc`` endpoint and prints its body. Uses the
    module-level ``session``, ``user`` and ``password``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    url = 'http://vexx.pro/verify/code.html'
    s = session.get(url=url, headers=headers)

    # Save the captcha so the user can open it and read the code.
    with open('code.png', 'wb') as f:
        f.write(s.content)

    # raw_input() only exists on Python 2; input() is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    code = read_line('input the code: ')
    print('code is  %s' % code)

    login_url = 'http://vexx.pro/login/up_login.html'
    post_data = {
        'moble': user,
        'mobles': '+86',
        'password': password,
        'verify': code,
        'login_token': ''}

    # Send the same headers as the captcha request.
    login_s = session.post(url=login_url, headers=headers, data=post_data)
    print(login_s.status_code)

    zzc_url = 'http://vexx.pro/ajax/check_zzc/'
    zzc_s = session.get(url=zzc_url, headers=headers)
    print(zzc_s.text)



def main():
    """Entry point: run the interactive login flow in getCode()."""

    getCode()



# Only run the login flow when executed as a script, not on import.
if __name__ == '__main__':

    main()

把自己的用户名和密码填上去，中途输入一次验证码。
可以把session保存到本地，然后下一次就可以不用再输入密码。

后记：经过几个月后，这个网站被证实是一个圈钱跑路的网站，目前已经无法正常登陆了。希望大家不要再上当了
原创地址：http://30daydo.com/article/263
转载请注明出处。

通知设置新通知