原理

用py写了个子域名爬取的脚本,用百度的site查询语法,支持多线程,结果会保存在output/域名.txt下。
原理很简单,用法如下

例如:执行 python getSubdomain.py -d qq.com,即可得到保存结果的文件。

源文件

(以下为脚本源代码)
import requests
from lxml import etree
import time
from fake_useragent import UserAgent
import click
import os
import sys
import threading
import queue


def output():
    """Return the path of the output directory, with a trailing separator,
    creating the directory next to this script if it does not exist.

    Returns:
        str: e.g. ``<script_dir>/output/`` — callers append a filename
        directly, so the trailing separator is part of the contract.
    """
    # Do not shadow the function's own name with a local variable, and use
    # os.path.join/os.sep instead of hard-coded '\\' so the script also
    # works on non-Windows systems.  os.path.dirname(__file__) may be ''
    # when run from the script's directory; os.path.join handles that.
    out_dir = os.path.join(os.path.dirname(__file__), "output")
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    return out_dir + os.sep

class getSub(threading.Thread):
    """Worker thread that pulls Baidu result-page offsets from a shared
    queue and scrapes subdomains of one site via Baidu's ``site:`` search."""

    def __init__(self, page, site, f):
        """
        Args:
            page: queue.Queue of Baidu result offsets (``pn`` values).
            site: target domain, e.g. ``qq.com``.
            f:    open text file where results are appended (shared by
                  all workers; writes are single ``print`` calls).
        """
        super().__init__()
        self.page = page
        self.site = site
        self.f = f

    def run(self):
        # get_nowait() closes the empty()/get() race of the original:
        # with several workers, another thread may drain the queue between
        # the two calls, and a plain get() would then block forever.
        while True:
            try:
                offset = self.page.get_nowait()
            except queue.Empty:
                break
            self.getSubdomain(offset)

    def getSubdomain(self, P):
        """Fetch one Baidu result page (offset ``P``) and record each hit's
        real host name and title, both to the output file and to stdout."""
        url = "https://www.baidu.com/s?wd=site%3A" + self.site + "&pn=" + str(P)
        # Random User-Agent per page to reduce the chance of being blocked.
        header = {
            'User-Agent': UserAgent().random
        }
        session = requests.Session()
        html = session.get(url, headers=header)
        html.encoding = 'utf-8'
        tree = etree.HTML(html.text)
        # Renamed from `list`/`title` so the builtin is not shadowed;
        # zip() pairs links with titles, replacing the manual counter `a`.
        links = tree.xpath('//h3/a/@href')
        titles = tree.xpath('//h3/a')
        for link, anchor in zip(links, titles):
            try:
                # Baidu links are redirects; follow one to get the real URL.
                sub = requests.get(link, headers=header).url
            except requests.RequestException:
                # Skip an unreachable result instead of killing the thread
                # (the original let the exception abort the whole worker).
                continue
            time.sleep(1)  # throttle so we don't hammer the targets
            host = sub.split('//')[1].split('/')[0]
            print("%s%s :\t\t%s" % (" ", host, anchor.text), file=self.f)
            print("%s%s :\t\t%s" % (" ", host, anchor.text), file=sys.stdout)


@click.command()
@click.option('-d', default='default', help='the target domain')
def main(d):
    """Scrape subdomains of domain ``d`` from Baidu with 3 worker threads
    and append them to ``output/<d>.txt``; with no ``-d``, print usage."""
    if d == 'default':
        use = '''
use 'getSubdomain.py --help' to get more help
'''
        print(use)
        return
    # Offsets 0,10,20,30 select the first four Baidu result pages.
    page = queue.Queue()
    for offset in range(0, 40, 10):
        page.put(offset)
    threadCount = 3
    result_file = output() + d + '.txt'
    # `with` guarantees the shared result file is flushed and closed even
    # if a worker raises — the original leaked the handle.
    with open(result_file, 'a', encoding='utf-8') as f:
        threads = [getSub(page, d, f) for _ in range(threadCount)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    # The original relied on '%' binding tighter than '+'
    # ("...%s" % 'output\\' + d + '.txt') and hard-coded the Windows
    # separator; build the displayed path explicitly and portably.
    print("完成!文件已存入%s" % os.path.join('output', d + '.txt'))


# Run the CLI entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()