1
xiaobai987 2017-07-24 15:27:07 +08:00
代码发上来看看 没代码怎么分析呢
|
2
VicYu 2017-07-24 15:27:20 +08:00
过滤掉 display:none;
|
3
yxy2829 OP 代码如下:
import requests
from lxml import etree


class Proxy(object):
    """Scrape proxy entries from the goubanjia.com "transparent" listing pages.

    Each yielded proxy is a tuple ``(ip, port, anonymous, http_type)`` of
    strings, exactly as before. Compared with the first version this one

    * ignores nodes hidden with an inline ``display:none`` style, which would
      otherwise be concatenated into the IP address;
    * decodes the port from the letters stored in the port ``<span>``'s
      ``class`` attribute (index of each letter in ``ABCDEFGHIZ`` gives one
      decimal digit, the resulting number is shifted right by 3), mirroring
      what the page's own script does in the browser;
    * skips table rows that do not look like ``ip:port`` instead of crashing.

    NOTE(review): the markup assumptions (hidden decoy nodes, ``port XXXX``
    class on the port span) come from inspecting the site's script, not from
    a published format -- re-check them if results look wrong.
    """

    # Letter -> digit alphabet used by the page script to encode ports.
    _PORT_LETTERS = 'ABCDEFGHIZ'

    # XPath predicate that rejects nodes hidden via inline ``display:none``
    # (spaces are removed first so ``display: none`` is matched as well).
    _VISIBLE = '[not(contains(translate(@style, " ", ""), "display:none"))]'

    # Same node selection as the original expression, minus hidden nodes.
    _IP_PORT_XPATH = (
        './td[1]/span[@style]{v}/text()|'
        './td[1]/div[@style]{v}/text()|'
        './td[1]/p[@style]{v}/text()|'
        './td[1]/text()|'
        './td[1]/span[@class]{v}/text()'
    ).format(v=_VISIBLE)

    def __init__(self):
        # ``{page}`` is filled with the page number by ``start``.
        self.tm_url = 'http://www.goubanjia.com/free/anoy/%E9%80%8F%E6%98%8E/index{page}.shtml'  # 1,2,3

    @classmethod
    def _decode_port(cls, class_value):
        """Return the port (as ``str``) encoded in a ``class`` value, or ``None``.

        ``class_value`` is expected to look like ``"port GEGEA"``; the second
        whitespace-separated token carries the encoded digits. ``None`` is
        returned when that token is missing or contains an unknown letter.
        """
        tokens = class_value.split()
        if len(tokens) < 2:
            return None
        digits = []
        for letter in tokens[1]:
            digit = cls._PORT_LETTERS.find(letter)
            if digit < 0:
                return None
            digits.append(str(digit))
        return str(int(''.join(digits)) >> 3)

    def get_proxy(self, url):
        """Fetch one listing page and yield ``(ip, port, anonymous, http_type)``.

        :param url: absolute URL of a listing page.
        :raises requests.RequestException: on network errors or timeout.
        """
        r = requests.get(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0,'
            },
            timeout=10,  # never hang forever on an unresponsive server
        )
        html = etree.HTML(r.content)
        for row in html.xpath('//table//tr[td]'):
            ip_port = ''.join(row.xpath(self._IP_PORT_XPATH))
            ip, sep, port = ip_port.partition(':')
            if not sep:
                # Not an "ip:port" row (e.g. an ad or layout row): skip it.
                continue
            ip = ip.strip()
            port = port.strip()

            # The visible port text is a placeholder; the real value is
            # encoded in the span's class. The page script leaves ports
            # containing '*' untouched, so do the same here.
            port_classes = row.xpath(
                './td[1]/span[contains(@class, "port")]/@class')
            if port_classes and '*' not in port:
                decoded = self._decode_port(port_classes[0])
                if decoded is not None:
                    port = decoded

            anonymity_nodes = row.xpath('./td[2]/a/text()')
            anonymous = anonymity_nodes[0] if anonymity_nodes else ''
            http_type = ''.join(row.xpath('./td[3]/a/text()')) or 'http'
            yield (ip, port, anonymous, http_type)

    def start(self):
        """Yield every proxy found on listing pages 1 and 2."""
        for page in range(1, 3):
            tm_url = self.tm_url.format(page=page)
            for proxy in self.get_proxy(tm_url):
                yield proxy


if __name__ == '__main__':
    p = Proxy()
    for i in p.start():
        # Parenthesised form prints the same tuple on Python 2 and Python 3.
        print(i)
5
xiaobai987 2017-07-24 15:42:12 +08:00
端口肯定利用 js 转换了 我再看看
|
6
yxy2829 OP @xiaobai987 好的
|
7
niuoh 2017-07-24 16:03:01 +08:00 1
推荐个用着不错的 爬虫代理 ip-chi.net
|
9
lc4t 2017-07-24 16:08:23 +08:00
```python
# First, test whether the code block gets recognized,
# especially whether line breaks are preserved.
def _(d): pass
``` |
10
lc4t 2017-07-24 16:09:01 +08:00
翻出来了原来写的。。
```python
def goubanjia_com(self, *args):
    """Crawl the goubanjia.com free-proxy listing page by page.

    Pages are requested with an increasing index until one answers with
    HTTP 404. Every table row is turned into a ``proxy`` object that is put
    on ``self.wait_for_verify``.

    NOTE(review): this snippet was pasted with its line breaks and
    indentation lost; the nesting below is a best-effort reconstruction and
    should be confirmed against the original file -- in particular which
    statements belong to the per-row loop and which to the per-page level.
    """
    logger.info('giubanjia.com start')
    i = 1  # index of the listing page to request next
    self.THREAD_ID += 1
    while(1):
        url = 'http://www.goubanjia.com/free/index%d.shtml' % (i)
        r = requests.get(url, headers=self.http_headers())
        if r.status_code == 404:
            # No such page: stop crawling.
            break
        try:
            html = BeautifulSoup(r.text, 'lxml')
            tbody = html.tbody
            for tr in tbody.find_all('tr'):
                p = proxy()
                # Remove every <p> element from the row before reading text.
                [x.extract() for x in tr.find_all('p')]
                try:
                    # The cell with class "ip" holds "ip:port" as text.
                    _ = tr.find_all('td', {'class':"ip"})[0].text
                    _ = _.split(':')
                    p.ip = _[0]
                    p.port = int(_[1])
                    # p.port = int(tr.find_all('td', {'data-title':"PORT"})[0].text)
                    # Remaining cells are read by position with whitespace removed.
                    p.safe = tr.find_all('td')[1].text.replace(' ', '').replace('\n', '').replace('\t', '')
                    p.type = tr.find_all('td')[2].text.replace(' ', '').replace('\n', '').replace('\t', '')
                    p.place = tr.find_all('td')[3].text.replace(' ', '').replace('\n', '').replace('\t', '').replace('\r', '').replace('\xa0', '')
                    p.net = tr.find_all('td')[4].text.replace(' ', '').replace('\n', '').replace('\t', '')
                except IndexError as e:
                    # A cell is missing or the text has no ':'; log the row.
                    print(tr)
                    logger.error('%s is index error' % p)
                    # exit(0)
                # NOTE(review): as reconstructed, a proxy is queued even after
                # an IndexError left it partially filled -- confirm intent.
                logger.debug('<get>%s' % p)
                self.wait_for_verify.put(p)
            # NOTE(review): placed at page level here; it may instead belong
            # inside the row loop -- indentation was lost in the paste.
            self.THREAD_ID += 1
            self.add_thread(self.verify_proxy_thread, self.THREAD_ID)
            logger.debug('%s ok' % url)
            gevent.sleep(1)
        except AttributeError as e:
            # e.g. ``html.tbody`` is None, so ``find_all`` fails; wait, then
            # retry the same page because ``i`` is not incremented.
            print(e)
            # print(r.text)
            gevent.sleep(10)
            logger.error('%s Error, sleep 10s' % url)
            continue
            # exit()
        i += 1
``` |
11
lc4t 2017-07-24 16:11:45 +08:00
|
12
xiaobai987 2017-07-24 16:19:16 +08:00
// String table used by the obfuscated script below; every `_$[n]` is a
// property name or literal looked up by index:
//   0 '.port'   1 "each"    2 "html"     3 "indexOf"    4 '*'      5 "attr"
//   6 'class'   7 "split"   8 " "        9 ""          10 "length" 11 "push"
//  12 'ABCDEFGHIZ'         13 "parseInt" 14 "join"      15 ''
var _$ = ['.port', "each", "html", "indexOf", '*', "attr", 'class', "split", " ", "", "length", "push", 'ABCDEFGHIZ', "parseInt", "join", ''];
// On DOM ready, replace the content of every `.port` element with the port
// number decoded from its class attribute.
$(function() {
    // $('.port').each(function () { ... })
    $(_$[0])[_$[1]](function() {
        // a = $(this).html()
        var a = $(this)[_$[2]]();
        // Leave the element untouched when its content contains '*'.
        if (a[_$[3]](_$[4]) != -0x1) { return };
        // b = $(this).attr('class')
        var b = $(this)[_$[5]](_$[6]);
        try {
            // Keep only the second space-separated class token.
            b = (b[_$[7]](_$[8]))[0x1];
            // c = individual characters of that token, d = their count.
            var c = b[_$[7]](_$[9]);
            var d = c[_$[10]];
            var f = [];
            // Map each character to its index in 'ABCDEFGHIZ'.
            for (var g = 0x0; g < d; g++) { f[_$[11]](_$[12][_$[3]](c[g])) };
            // Join the indices into one string, parse it as an integer,
            // shift right by 3 and write the result back as the content.
            $(this)[_$[2]](window[_$[13]](f[_$[14]](_$[15])) >> 0x3)
        } catch (e) {}  // any failure leaves the element unchanged
    })
})
这是解密代码哦 |
13
yxy2829 OP |
14
tangzipeng 2017-07-24 18:10:29 +08:00 1
def get_poxy(port_word):
    """Decode the real port number from a port element's ``class`` value.

    Usage, as described by the author: take the class string that carries
    the upper-case letters from the port element, pass it in, and the port
    number is returned.

    The second whitespace-separated token of ``port_word`` holds the encoded
    port: each letter's index in ``'ABCDEFGHIZ'`` is one decimal digit, and
    the resulting number shifted right by 3 bits is the port.

    :param port_word: class string such as ``"port GEGEA"``.
    :returns: the decoded port as an ``int`` (``"port GEGEA"`` -> ``8080``).
    :raises ValueError: if there is no second token, or it contains a
        character outside ``'ABCDEFGHIZ'``.
    """
    alphabet = 'ABCDEFGHIZ'

    # Split on any whitespace and take token 1, so repeated spaces or extra
    # trailing class names do not break the decoding.
    tokens = port_word.split()
    if len(tokens) < 2:
        raise ValueError('no encoded port found in %r' % (port_word,))

    digits = []
    for letter in tokens[1]:
        digit = alphabet.find(letter)
        if digit < 0:
            # ``find`` returns -1 for unknown letters; fail loudly instead of
            # splicing "-1" into the number and returning a bogus port.
            raise ValueError(
                'unexpected character %r in encoded port %r'
                % (letter, tokens[1]))
        digits.append(str(digit))

    return int(''.join(digits)) >> 0x3
15
xiaobai987 2017-07-25 09:13:33 +08:00
@tangzipeng 厉害 关键还是解密 JS
|
16
yxy2829 OP |