Skip to content

Commit

Permalink
remove is_need_unlock (#147)
Browse files (browse the repository at this point in the history)
* remove is_need_unlock

* fix some variable
  • Loading branch information
chyroc authored Oct 1, 2017
1 parent 513c135 commit ad07cb0
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 27 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ build/
.hypothesis/
test/.hypothesis/
t.py
y.py
y.py
tencent_captcha/
2 changes: 1 addition & 1 deletion wechatsogou/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

"""
WechatSogou Crawler Library
~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""

Expand Down
24 changes: 15 additions & 9 deletions wechatsogou/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@

class WechatSogouAPI(object):
def __init__(self, captcha_break_time=1, proxies=None):
"""初始化参数
Parameters
----------
captcha_break_time : int
验证码输入错误重试次数
proxies : dict
代理
"""
assert isinstance(captcha_break_time, int) and 0 < captcha_break_time < 20

self.captcha_break_times = captcha_break_time
Expand All @@ -31,8 +40,10 @@ def __set_cookie(self, suv=None, snuid=None, referer=None):
suv = ws_cache.get('suv') if suv is None else suv
snuid = ws_cache.get('snuid') if snuid is None else snuid

return {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)} if referer is None else {
'Cookie': 'SUV={};SNUID={};'.format(suv, snuid), 'Referer': referer}
_headers = {'Cookie': 'SUV={};SNUID={};'.format(suv, snuid)}
if referer is not None:
_headers['Referer'] = referer
return _headers

def __set_cache(self, suv, snuid):
ws_cache.set('suv', suv)
Expand Down Expand Up @@ -79,9 +90,8 @@ def __unlock_wechat(self, url, resp, session, unlock_callback=None, identify_ima
'[WechatSogouAPI identify image] code: {ret}, msg: {errmsg}, cookie_count: {cookie_count}'.format(
ret=r_unlock.get('ret'), errmsg=r_unlock.get('errmsg'), cookie_count=r_unlock.get('cookie_count')))

def __get_by_unlock(self, url, referer=None, is_need_unlock=None, unlock_platform=None, unlock_callback=None,
def __get_by_unlock(self, url, referer=None, unlock_platform=None, unlock_callback=None,
identify_image_callback=None):
assert is_need_unlock is None or callable(is_need_unlock)
assert unlock_platform is None or callable(unlock_platform)

if identify_image_callback is None:
Expand All @@ -92,7 +102,7 @@ def __get_by_unlock(self, url, referer=None, is_need_unlock=None, unlock_platfor
session = requests.session()
resp = self.__get(url, session, headers=self.__set_cookie(referer=referer))

if is_need_unlock(resp):
if 'antispider' in resp.url or '请输入验证码' in resp.text:
for i in range(self.captcha_break_times):
try:
unlock_platform(url, resp, session, unlock_callback, identify_image_callback)
Expand Down Expand Up @@ -179,7 +189,6 @@ def search_gzh(self, keyword, page=1, unlock_callback=None, identify_image_callb
"""
url = WechatSogouRequest.gen_search_gzh_url(keyword, page)
resp = self.__get_by_unlock(url,
is_need_unlock=lambda x: 'antispider' in x.url,
unlock_platform=self.__unlock_sogou,
unlock_callback=unlock_callback,
identify_image_callback=identify_image_callback)
Expand Down Expand Up @@ -243,7 +252,6 @@ def search_article(self, keyword, page=1, timesn=WechatSogouConst.search_article
"""
url = WechatSogouRequest.gen_search_article_url(keyword, page, timesn, article_type, ft, et)
resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword),
is_need_unlock=lambda x: 'antispider' in x.url,
unlock_platform=self.__unlock_sogou,
unlock_callback=unlock_callback,
identify_image_callback=identify_image_callback)
Expand Down Expand Up @@ -321,7 +329,6 @@ def get_gzh_artilce_by_history(self, keyword=None, url=None,
url = gzh_list['profile_url']

resp = self.__get_by_unlock(url, WechatSogouRequest.gen_search_article_url(keyword),
is_need_unlock=lambda x: '请输入验证码' in x.text,
unlock_platform=self.__unlock_wechat,
unlock_callback=unlock_callback_weixin,
identify_image_callback=identify_image_callback_weixin)
Expand Down Expand Up @@ -362,7 +369,6 @@ def get_gzh_artilce_by_hot(self, hot_index, page=1, unlock_callback=None, identi

url = WechatSogouRequest.gen_hot_url(hot_index, page)
resp = self.__get_by_unlock(url,
is_need_unlock=lambda x: 'antispider' in x.url,
unlock_platform=self.__unlock_sogou,
unlock_callback=unlock_callback,
identify_image_callback=identify_image_callback)
Expand Down
5 changes: 4 additions & 1 deletion wechatsogou/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ class _WechatSogouSearchArticleTypeConst(object):

@Const
class _WechatSogouSearchArticleTimeConst(object):
"""时间 0 没有限制 / 1一天 / 2一周 / 3一月 / 4一年 / 5自定"""
"""搜索条件 时间
0 没有限制 / 1一天 / 2一周 / 3一月 / 4一年 / 5自定
"""
anytime = 0
day = 1
week = 2
Expand Down
7 changes: 5 additions & 2 deletions wechatsogou/identify_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from wechatsogou.five import readimg, input
from wechatsogou.filecache import WechatCache
from wechatsogou.exceptions import WechatSogouVcodeOcrException

ws_cache = WechatCache()

Expand Down Expand Up @@ -69,7 +70,8 @@ def unlock_sogou_callback_example(url, req, resp, img, identify_image_callback):
}
r_unlock = req.post(unlock_url, data, headers=headers)
if not r_unlock.ok:
raise Exception() # todo use ws exception
raise WechatSogouVcodeOcrException(
'unlock[{}] failed: {}'.format(unlock_url, r_unlock.text, r_unlock.status_code))

return r_unlock.json()

Expand Down Expand Up @@ -113,6 +115,7 @@ def unlock_weixin_callback_example(url, req, resp, img, identify_image_callback)
}
r_unlock = req.post(unlock_url, data, headers=headers)
if not r_unlock.ok:
raise Exception() # todo use ws exception
raise WechatSogouVcodeOcrException(
'unlock[{}] failed: {}[{}]'.format(unlock_url, r_unlock.text, r_unlock.status_code))

return r_unlock.json()
24 changes: 12 additions & 12 deletions wechatsogou/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from wechatsogou.five import urlencode
from wechatsogou.const import WechatSogouConst

_search_type_gzh = 1 # 1 是公号
_search_type_article = 2 # 2 是文章
_search_type_gzh = 1 # 公众号
_search_type_article = 2 # 文章


class WechatSogouRequest(object):
Expand Down Expand Up @@ -66,24 +66,24 @@ def gen_search_article_url(keyword, page=1, timesn=WechatSogouConst.search_artic
else:
interation = ''

qsDict = OrderedDict()
qsDict['type'] = _search_type_article
qsDict['page'] = page
qsDict['ie'] = 'utf8'
qsDict['query'] = keyword
qs_dict = OrderedDict()
qs_dict['type'] = _search_type_article
qs_dict['page'] = page
qs_dict['ie'] = 'utf8'
qs_dict['query'] = keyword
qs_dict['interation'] = interation
if timesn != 0:
qsDict['tsn'] = timesn
qsDict['ft'] = str(ft)
qsDict['et'] = str(et)
qsDict['interation'] = interation
qs_dict['tsn'] = timesn
qs_dict['ft'] = str(ft)
qs_dict['et'] = str(et)

# TODO 账号内搜索
# '账号内 http://weixin.sogou.com/weixin?type=2&ie=utf8&query=%E9%AB%98%E8%80%83&tsn=3&ft=&et=&interation=458754
# &wxid=oIWsFt1tmWoG6vO6BcsS7St61bRE&usip=nanhangqinggong'
# qs['wxid'] = wxid
# qs['usip'] = usip

return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qsDict))
return 'http://weixin.sogou.com/weixin?{}'.format(urlencode(qs_dict))

@staticmethod
def gen_search_gzh_url(keyword, page=1):
Expand Down
2 changes: 1 addition & 1 deletion wechatsogou/structuring.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ def get_gzh_artilce_by_hot(text):

try:
send_time = int(send_time[0])
except:
except ValueError:
send_time = send_time[0]

gzh_article_list.append({
Expand Down

0 comments on commit ad07cb0

Please sign in to comment.