MCPcopy
hub / github.com/pingc0y/URLFinder / Spider

Function Spider

crawler/crawler.go:17–139  ·  view source on GitHub ↗

蜘蛛抓取页面内容

(u string, num int)

Source from the content-addressed store, hash-verified

15
16// 蜘蛛抓取页面内容
17func Spider(u string, num int) {
18 is := true
19 defer func() {
20 config.Wg.Done()
21 if is {
22 <-config.Ch
23 }
24
25 }()
26 config.Mux.Lock()
27 fmt.Printf("\rStart %d Spider...", config.Progress)
28 config.Progress++
29 config.Mux.Unlock()
30 //标记完成
31
32 u, _ = url.QueryUnescape(u)
33 if num > 1 && cmd.D != "" && !util.RegexpMatch(cmd.D, u) {
34 return
35 }
36 if GetEndUrl(u) {
37 return
38 }
39 if cmd.M == 3 {
40 for _, v := range config.Risks {
41 if strings.Contains(u, v) {
42 return
43 }
44 }
45 }
46 AppendEndUrl(u)
47 request, err := http.NewRequest("GET", u, nil)
48 if err != nil {
49 return
50 }
51
52 request.Header.Set("Accept-Encoding", "gzip") //使用gzip压缩传输数据让访问更快
53 request.Header.Set("User-Agent", util.GetUserAgent())
54 request.Header.Set("Accept", "*/*")
55 //增加header选项
56 if cmd.C != "" {
57 request.Header.Set("Cookie", cmd.C)
58 }
59 //加载yaml配置(headers)
60 if cmd.I {
61 util.SetHeadersConfig(&request.Header)
62 }
63
64 //处理返回结果
65 //tr := &http.Transport{
66 // TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
67 //}
68 //client = &http.Client{Timeout: time.Duration(cmd.TI) * time.Second,
69 // Transport: tr,
70 // CheckRedirect: func(req *http.Request, via []*http.Request) error {
71 // if len(via) >= 10 {
72 // return fmt.Errorf("Too many redirects")
73 // }
74 // if len(via) > 0 {

Callers 4

jsFind (Function) · 0.85
urlFind (Function) · 0.85
startFF (Function) · 0.85
start (Function) · 0.85

Calls 9

RegexpMatch (Function) · 0.92
GetUserAgent (Function) · 0.92
SetHeadersConfig (Function) · 0.92
ReadAllLimited (Function) · 0.92
GetEndUrl (Function) · 0.85
AppendEndUrl (Function) · 0.85
jsFind (Function) · 0.85
urlFind (Function) · 0.85
infoFind (Function) · 0.85

Tested by

no test coverage detected