原理

用py写了个子域名爬取的脚本,用百度的site查询语法,支持多线程,结果会保存在output/域名.txt下。
原理很简单,用法如下

例如:执行 python getSubdomain.py -d qq.com,即可得到保存结果的文件。

源文件

(以下为脚本源代码)
import requests
from lxml import etree
import time
from fake_useragent import UserAgent
import click
import os
import sys
import threading
import queue


def output():
    """Return the path of the output directory, with a trailing separator,
    creating the directory next to this script if it does not exist.

    Returns:
        str: e.g. ``<script_dir>/output/`` — callers append a filename
        directly, so the trailing separator is part of the contract.
    """
    # Do not shadow the function's own name with a local variable, and use
    # os.path.join/os.sep instead of hard-coded '\\' so the script also
    # works on non-Windows systems.  os.path.dirname(__file__) may be ''
    # when run from the script's directory; os.path.join handles that.
    out_dir = os.path.join(os.path.dirname(__file__), "output")
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    return out_dir + os.sep

class getSub(threading.Thread):
    """Worker thread that pulls Baidu result-page offsets from a shared
    queue and scrapes subdomains of one site via Baidu's ``site:`` search."""

    def __init__(self, page, site, f):
        """
        Args:
            page: queue.Queue of Baidu result offsets (``pn`` values).
            site: target domain, e.g. ``qq.com``.
            f:    open text file where results are appended (shared by
                  all workers; writes are single ``print`` calls).
        """
        super().__init__()
        self.page = page
        self.site = site
        self.f = f

    def run(self):
        # get_nowait() closes the empty()/get() race of the original:
        # with several workers, another thread may drain the queue between
        # the two calls, and a plain get() would then block forever.
        while True:
            try:
                offset = self.page.get_nowait()
            except queue.Empty:
                break
            self.getSubdomain(offset)

    def getSubdomain(self, P):
        """Fetch one Baidu result page (offset ``P``) and record each hit's
        real host name and title, both to the output file and to stdout."""
        url = "https://www.baidu.com/s?wd=site%3A" + self.site + "&pn=" + str(P)
        # Random User-Agent per page to reduce the chance of being blocked.
        header = {
            'User-Agent': UserAgent().random
        }
        session = requests.Session()
        html = session.get(url, headers=header)
        html.encoding = 'utf-8'
        tree = etree.HTML(html.text)
        # Renamed from `list`/`title` so the builtin is not shadowed;
        # zip() pairs links with titles, replacing the manual counter `a`.
        links = tree.xpath('//h3/a/@href')
        titles = tree.xpath('//h3/a')
        for link, anchor in zip(links, titles):
            try:
                # Baidu links are redirects; follow one to get the real URL.
                sub = requests.get(link, headers=header).url
            except requests.RequestException:
                # Skip an unreachable result instead of killing the thread
                # (the original let the exception abort the whole worker).
                continue
            time.sleep(1)  # throttle so we don't hammer the targets
            host = sub.split('//')[1].split('/')[0]
            print("%s%s :\t\t%s" % (" ", host, anchor.text), file=self.f)
            print("%s%s :\t\t%s" % (" ", host, anchor.text), file=sys.stdout)


@click.command()
@click.option('-d', default='default', help='the target domain')
def main(d):
    """Scrape subdomains of domain ``d`` from Baidu with 3 worker threads
    and append them to ``output/<d>.txt``; with no ``-d``, print usage."""
    if d == 'default':
        use = '''
use 'getSubdomain.py --help' to get more help
'''
        print(use)
        return
    # Offsets 0,10,20,30 select the first four Baidu result pages.
    page = queue.Queue()
    for offset in range(0, 40, 10):
        page.put(offset)
    threadCount = 3
    result_file = output() + d + '.txt'
    # `with` guarantees the shared result file is flushed and closed even
    # if a worker raises — the original leaked the handle.
    with open(result_file, 'a', encoding='utf-8') as f:
        threads = [getSub(page, d, f) for _ in range(threadCount)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    # The original relied on '%' binding tighter than '+'
    # ("...%s" % 'output\\' + d + '.txt') and hard-coded the Windows
    # separator; build the displayed path explicitly and portably.
    print("完成!文件已存入%s" % os.path.join('output', d + '.txt'))


# Run the CLI entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()