Colly学习笔记

黑麋鹿收录于类别 Go

2023-04-11 2023-04-11 约 3889 字预计阅读 8 分钟

colly实例项目

https://github.com/hakluke/hakrawler，可搜索学习

go语言爬虫框架Colly学习

https://darjun.github.io/2021/06/30/godailylib/colly/ 以下代码是自己结合ChatGPT的回答与测试的结果。

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149


package main

import (
	"fmt"
	"log"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"strings"
	"time"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/debug"
)

// c is the package-level collector shared by init and main.
var c = colly.NewCollector(
	// Send requests asynchronously; main has to call c.Wait() before exiting.
	colly.Async(true),
	// Let the same URL be requested more than once.
	colly.AllowURLRevisit(),

	// Attach a debugger. Two implementations from the debug package are
	// described below; the web one is the one in use.
	//
	// 1. LogDebugger prints events to the console:
	//      colly.Debugger(&debug.LogDebugger{})
	//    Sample console output:
	//      [000068] 1 [    66 - responseHeaders] map["status":"OK" "url":"https://httpbin.org/post"] (744.2505ms)
	//      [000067] 1 [     1 - responseHeaders] map["status":"OK" "url":"https://httpbin.org/cookies?param1=value1&param2=value2"] (744.2505ms)
	//
	// 2. WebDebugger shows request state on a web page, served by default at
	//    http://127.0.0.1:7676/. The page looks roughly like this:
	//      Current Requests (1)#                                  Finished Requests (115)
	//      https://httpbin.org/post                               https://httpbin.org/post
	//      Collector #1 - 2023-04-12T08:00:01.0840252+08:00       Collector #1 - 64.4904635s
	colly.Debugger(&debug.WebDebugger{}),
)

// init prepares the shared collector c before main runs. It installs an HTTP
// client whose cookie jar is pre-seeded for httpbin.org, points the collector
// at a local proxy, applies a limit rule, and registers an OnRequest hook that
// stamps headers onto every outgoing request.
//
// Any setup failure terminates the program via log.Fatal.
func init() {
	// Cookie stored in the jar for httpbin.org before any request is made.
	initialCookie := &http.Cookie{
		Name:   "initial_cookie",
		Value:  "initial_value",
		Domain: "httpbin.org",
	}

	// Build the cookie jar and seed it with the initial cookie.
	jar, err := cookiejar.New(nil)
	if err != nil {
		log.Fatal("创建cookiejar失败:", err)
	}
	hostURL, err := url.Parse("https://httpbin.org")
	if err != nil {
		log.Fatal("解析URL失败:", err)
	}
	jar.SetCookies(hostURL, []*http.Cookie{initialCookie})

	// Hand the collector a custom http.Client so that it uses the seeded jar.
	c.SetClient(&http.Client{Jar: jar})

	// Headers applied to every request by the OnRequest hook below.
	initialHeaders := map[string]string{
		"User-Agent":    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
		"Accept-Custom": "en-US,en;q=0.9",
	}

	// Configure the proxy.
	if err := c.SetProxy("http://127.0.0.1:8889"); err != nil {
		log.Fatal("设置代理失败:", err)
	}

	// Limit rule for all domains: parallelism of 5 and a 5s RandomDelay.
	limitErr := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 5,
		RandomDelay: 5 * time.Second,
	})
	if limitErr != nil {
		log.Fatal("设置并发限制失败:", limitErr)
	}

	// Pre-request hook: announce the method, then set the headers.
	c.OnRequest(func(r *colly.Request) {
		switch r.Method {
		case "GET":
			fmt.Println("👉正准备发起GET请求，当前处于请求前中间件函数")
		case "POST":
			fmt.Println("👆正准备发起POST请求，当前处于请求前中间件函数")
		}

		// Apply the common headers.
		for k, v := range initialHeaders {
			r.Headers.Set(k, v)
		}

		// Add an extra header only when the URL path contains "headers".
		if strings.Contains(r.URL.Path, "headers") {
			r.Headers.Set("Custom-Header", "Custom-Value")
		}
	})
}
// main fires a batch of GET requests (each with two query parameters) and a
// batch of POST requests through the shared collector c, then waits for the
// asynchronous requests to finish.
func main() {
	// URLs to visit with GET.
	urls := []string{"https://httpbin.org/cookies", "https://httpbin.org/html", "https://httpbin.org/headers"}

	// Register the HTML handler exactly once, before any request is sent.
	// Calling c.OnHTML from inside the OnResponse callback would register one
	// more handler each time a matching response arrives.
	c.OnHTML("body h1", func(e *colly.HTMLElement) {
		fmt.Println("Processing HTML response...", e.Text)
	})

	// Response handler: the /html page is left to the OnHTML handler above,
	// every other response body is printed as-is.
	c.OnResponse(func(r *colly.Response) {
		reqURL := r.Request.URL.String() // named reqURL so the net/url package is not shadowed
		if strings.Contains(reqURL, "https://httpbin.org/html") {
			return
		}
		fmt.Println("Processing non-HTML response...", string(r.Body))
		// fmt.Println(time.Now().Format("2006-01-02 15:04:05"), "Finished Visited URL:", r.Request.URL, r.Request.Method)
		// if r.Request.Method == "POST" {
		// 	fmt.Println(r.Request.Body)
		// }
		// fmt.Println("Response body:", string(r.Body))
	})

	// GET requests via Visit; this loop doubles as a concurrency test.
	for _, baseURL := range urls {
		// Build the query string.
		queryParams := url.Values{}
		queryParams.Add("param1", "value1")
		queryParams.Add("param2", "value2")

		// Append the query string to the base URL.
		requestURL := fmt.Sprintf("%s?%s", baseURL, queryParams.Encode())
		if err := c.Visit(requestURL); err != nil {
			fmt.Println("Visit error:", err)
		}
	}

	// POST requests; this loop doubles as a concurrency test.
	for i := 1; i < 10; i += 2 {
		postURL := "https://httpbin.org/post"
		postData := map[string]string{
			"param1": fmt.Sprintf("%d", i),
			"param2": "value2",
		}

		if err := c.Post(postURL, postData); err != nil {
			log.Println("POST request error:", err)
		}
	}

	// The collector is asynchronous, so block until every request is done.
	c.Wait()
}

输出结果（部分省略）

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20


C:\Users\admin\Desktop\Colly学习>go run .
👉正准备发起GET请求，当前处于请求前中间件函数
👆正准备发起POST请求，当前处于请求前中间件函数
👉正准备发起GET请求，当前处于请求前中间件函数
👆正准备发起POST请求，当前处于请求前中间件函数
👆正准备发起POST请求，当前处于请求前中间件函数
👆正准备发起POST请求，当前处于请求前中间件函数
👆正准备发起POST请求，当前处于请求前中间件函数
👆正准备发起POST请求，当前处于请求前中间件函数
👆正准备发起POST请求，当前处于请求前中间件函数
👉正准备发起GET请求，当前处于请求前中间件函数
2023-04-11 23:02:46 Finished Visited URL: https://httpbin.org/post POST
2023-04-11 23:02:46 Finished Visited URL: https://httpbin.org/cookies?param1=value1&param2=value2 GET
2023-04-11 23:02:46 Finished Visited URL: https://httpbin.org/headers?param1=value1&param2=value2 GET
2023-04-11 23:02:47 Finished Visited URL: https://httpbin.org/post POST
2023-04-11 23:02:47 Finished Visited URL: https://httpbin.org/post POST
2023-04-11 23:02:47 Finished Visited URL: https://httpbin.org/cookies?param1=value1&param2=value2 GET
2023-04-11 23:02:47 Finished Visited URL: https://httpbin.org/cookies?param1=value1&param2=value2 GET
2023-04-11 23:02:47 Finished Visited URL: https://httpbin.org/cookies?param1=value1&param2=value2 GET
C:\Users\admin\Desktop\Colly学习>

自定义参数的传递

请求中间件中开始传递[意义不大]

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


package main

import (
	"fmt"
	"github.com/gocolly/colly/v2"
)

// main shows how a value stored in the request Context inside OnRequest can be
// read back from the same Context inside OnResponse.
func main() {
	// Page to fetch.
	const target = "https://www.httpbin.org/headers"

	collector := colly.NewCollector()

	// Response side: read the value back out of the Context.
	collector.OnResponse(func(resp *colly.Response) {
		fmt.Printf("获取传递的自定义参数: %s\n", resp.Ctx.Get("customData"))
	})

	// Request side: log the URL and stash the custom value in the Context.
	collector.OnRequest(func(req *colly.Request) {
		fmt.Println("Visiting", req.URL.String())
		req.Ctx.Put("customData", "Hello, I'm custom data!")
	})

	// Start the crawl.
	if err := collector.Visit(target); err != nil {
		fmt.Println("Error:", err)
	}
}

请求中间件中先解析请求的各项参数，再进行传递，下面以解析POST表单参数示例

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18


	// Pre-request hook that extracts the "param1" form field from a POST body
	// and stores it in the request Context.
	//
	// This fragment needs "io", "log", "net/url" and "strings" to be imported.
	c.OnRequest(func(r *colly.Request) {
		if r.Method != "POST" {
			return
		}

		// NOTE(review): confirm that Content-Type is already populated at
		// OnRequest time for requests issued through c.Post.
		contentType := r.Headers.Get("Content-Type")
		if !strings.Contains(contentType, "application/x-www-form-urlencoded") {
			return
		}

		// colly exposes the request body as an io.Reader, so it cannot be
		// converted with string(...); it has to be read first. Only seekable
		// bodies are inspected, so the reader can be rewound afterwards and
		// the request still carries its payload.
		// NOTE(review): assumes the HTTP transport reads from this same
		// reader - verify against the colly version in use.
		body, ok := r.Body.(io.ReadSeeker)
		if !ok {
			return
		}
		raw, err := io.ReadAll(body)
		if err != nil {
			log.Println("Error reading form data:", err)
			return
		}
		if _, err := body.Seek(0, io.SeekStart); err != nil {
			log.Println("Error rewinding form data:", err)
			return
		}

		// Parse the form data.
		form, err := url.ParseQuery(string(raw))
		if err != nil {
			log.Println("Error parsing form data:", err)
			return
		}

		// Forward the "param1" field, if present, to later callbacks.
		if param1 := form.Get("param1"); param1 != "" {
			r.Ctx.Put("param1", param1)
		}
	})

利用Context计算请求耗时

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48


package main

import (
	"fmt"
	"time"

	"github.com/gocolly/colly/v2"
)

// main measures how long a request takes by stashing the start time in the
// request Context (as an RFC3339Nano string) and reading it back in OnResponse.
func main() {
	// Page to fetch.
	const target = "https://www.163.com"

	collector := colly.NewCollector()

	// Response side: parse the stored timestamp and print the elapsed time.
	collector.OnResponse(func(resp *colly.Response) {
		began, err := time.Parse(time.RFC3339Nano, resp.Ctx.Get("startTime"))
		if err != nil {
			fmt.Println("Error:", err)
			return
		}
		fmt.Printf("Response received, duration: %v\n", time.Since(began))
	})

	// Request side: log the URL, then record the start time as a string.
	collector.OnRequest(func(req *colly.Request) {
		fmt.Println("Visiting", req.URL.String())
		stamp := time.Now().Format(time.RFC3339Nano)
		req.Ctx.Put("startTime", stamp)
	})

	// Start the crawl.
	if err := collector.Visit(target); err != nil {
		fmt.Println("Error:", err)
	}
}

在Visit或Post方法中传递自定义参数给OnResponse方法

这种情形用于每个不同的请求，自定义参数给下一个环节的response处理。缺点是对于带表单的POST请求不友好，本来可以map[string]string简洁构造请求表单，然后colly会自动帮你底层实现编码等…现在需要手动进行编码等。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


package main

import (
	"fmt"
	"log"
	"net/http"
	"net/url"
	"strings"

	"github.com/gocolly/colly/v2"
)

// main sends one form-encoded POST request through c.Request, attaching a
// caller-created Context so that OnResponse can read a custom value that was
// set before the request was issued.
func main() {
	c := colly.NewCollector(colly.Async(true))

	// Response handler: print the body and the value carried in the Context.
	c.OnResponse(func(r *colly.Response) {
		log.Println("Response received:", string(r.Body))
		fmt.Println("ctx from request", r.Ctx.Get("customData"))
	})
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Form fields for the POST request, encoded by hand because c.Request
	// takes a reader rather than a map.
	postData := map[string]string{
		"param1": "value1",
		"param2": "value2",
	}
	postDataStr := encodePostParams(postData)

	// Context carrying the custom value for this request only.
	ctx := colly.NewContext()
	ctx.Put("customData", "Hello, I'm custom data!")

	// Send the POST request and report a failure instead of dropping it.
	err := c.Request("POST", "http://httpbin.org/post", strings.NewReader(postDataStr), ctx, http.Header{
		"Content-Type": []string{"application/x-www-form-urlencoded"},
	})
	if err != nil {
		log.Println("Request error:", err)
	}

	// Wait for the asynchronous request to complete.
	c.Wait()
}

// encodePostParams turns a string map into a URL-encoded form string
// (for example "a=1&b=2"), using url.Values.Encode for the escaping.
func encodePostParams(params map[string]string) string {
	form := make(url.Values, len(params))
	for name, val := range params {
		form[name] = []string{val}
	}
	return form.Encode()
}

不过,如果任务足够复杂或具有不同类型的子任务，建议对一个抓取作业使用多个Collector。比如一个解析列表视图并处理分页，另一个收集详细信息。使用多个Collector的中文例子:https://darjun.github.io/2021/06/30/godailylib/colly/ 官方例子：https://go-colly.org/docs/best_practices/multi_collector/

全局自定义请求和单个自定义请求

在全局收集器中设置 Cookies

在这种方法中，您可以通过 OnRequest 回调为收集器发出的每个请求设置 Header（此处为 Cookie），这些 Header 将应用于收集器发出的所有请求。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


package main

import (
	"fmt"
	"github.com/gocolly/colly"
)

// main sets a Cookie header on every request the collector sends by doing so
// inside an OnRequest callback, then visits a single URL.
func main() {
	collector := colly.NewCollector()

	// Print each URL once its response has arrived.
	collector.OnResponse(func(resp *colly.Response) {
		fmt.Println("Visited", resp.Request.URL)
	})

	// Collector-wide header: applied to every outgoing request.
	collector.OnRequest(func(req *colly.Request) {
		req.Headers.Set("Cookie", "name=value; anothername=anothervalue")
	})

	if err := collector.Visit("http://httpbin.org/cookies"); err != nil {
		panic(err)
	}
}

在单个请求中设置 Cookies

如果您只想在特定的请求中设置 cookies，而不是在收集器的所有请求中，您可以改用 c.Request 方法，并通过它的最后一个参数为该请求单独传入 Header。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20


package main

import (
	"fmt"
	"github.com/gocolly/colly"
)

// main sends a single GET request that carries its own Cookie header, without
// affecting any other request made by the collector.
func main() {
	c := colly.NewCollector()

	c.OnResponse(func(r *colly.Response) {
		fmt.Println("Visited", r.Request.URL)
	})

	// Header for this one request. c.Request takes the headers as its last
	// argument (an http.Header). A plain map[string][]string literal is
	// assignable to that parameter, so no extra import is needed. Chaining
	// .Set(...) on a header value cannot work here, because http.Header.Set
	// returns nothing.
	headers := map[string][]string{
		"Cookie": {"name=value; anothername=anothervalue"},
	}

	err := c.Request("GET", "http://httpbin.org/cookies", nil, nil, headers)
	if err != nil {
		panic(err)
	}
}

禁止跟进跳转及响应头捕获

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/gocolly/colly/v2"
)

// main visits a URL that answers with a redirect, refuses to follow it, and
// prints the status code and headers of the response as soon as the headers
// arrive.
func main() {
	collector := colly.NewCollector()

	// Dump the status code and every response header.
	collector.OnResponseHeaders(func(resp *colly.Response) {
		fmt.Println("Response Code:\t", resp.StatusCode)
		fmt.Println("Response Headers:")
		for name, values := range *resp.Headers {
			fmt.Println("\t", name, ":", values)
		}
	})

	collector.SetRedirectHandler(func(next *http.Request, via []*http.Request) error {
		// Log the link the client was about to be redirected to.
		log.Println("Redirecting to", next.URL)
		// Returning http.ErrUseLastResponse stops the redirect from being followed.
		return http.ErrUseLastResponse
		// Alternative: adjust the follow-up request and return nil to allow the redirect.
		// next.Header.Set("Cookie", "JSESSIONID")
		// return nil
	})

	collector.Visit("https://httpbin.org/redirect/6")
}
// Response Code:   302
// Response Headers:
//          Access-Control-Allow-Credentials : [true]
//          Date : [Fri, 08 Dec 2023 04:33:50 GMT]
//          Content-Type : [text/html; charset=utf-8]
//          Content-Length : [247]
//          Server : [gunicorn/19.9.0]
//          Location : [/relative-redirect/5]
//          Access-Control-Allow-Origin : [*]

关于如何捕获302响应头，请参考这个issue

禁止和不禁止`Redirect`的影响

禁止

OnResponseHeaders能够捕获到响应头。即使禁止跳转，仍然无法使用OnResponse或者OnHTML捕获响应体，推测OnResponse或者OnHTML仅当StatusCode == 200时才会执行。

不禁止

OnResponseHeaders不能捕获到响应头。 OnRequest也无法捕获到任何请求信息，貌似自动跳转的请求绕过了OnRequest。但是SetRedirectHandler能捕获到请求信息，比如请求的URL。

结论

OnRequest只对人工指定的请求生效，不包括程序自动处理的，比如下面的OnRequest是生效的，因为Visit方法的请求都是人工指定的。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


package main

import (
	"fmt"
	"net/http"

	"github.com/gocolly/colly/v2"
)

// main follows a redirect chain by hand: automatic redirects are disabled, and
// each response's Location header is visited explicitly, so every hop passes
// through the OnRequest callback.
func main() {
	// 1. Login Action Start
	c := colly.NewCollector()

	// Never follow redirects automatically.
	c.SetRedirectHandler(func(r *http.Request, via []*http.Request) error {
		return http.ErrUseLastResponse
	})

	// Print the URL of every request that is issued explicitly.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println(r.URL.String())
	})

	c.OnResponseHeaders(func(r *colly.Response) {
		fmt.Println("Response Code:\t", r.StatusCode)

		// No Location header means there is nothing further to follow; without
		// this guard the bare host would be requested at the end of the chain.
		location := r.Headers.Get("Location")
		if location == "" {
			return
		}

		// Resolve Location against the request URL so that both relative and
		// absolute values work, instead of prefixing a hard-coded host.
		redirectURL := r.Request.AbsoluteURL(location)
		if redirectURL == "" {
			return
		}

		// The error returned by Visit is not inspected in this demo.
		c.Visit(redirectURL)
	})

	c.Visit("https://httpbin.org/redirect/10")
}

OnResponse、OnHTML等回调函数的执行顺序

追踪c.Visit->c.scrape->c.fetch发现依次是：OnRequest → OnError → OnResponse → OnHTML → OnXML → OnScraped

JSON请求

1
2
3
4
5
6


payload := []byte(`{"user":{"email":"anon@example.com","password":"mypassword"}}`)
c := colly.NewCollector()
c.OnRequest(func(r *colly.Request) {
	r.Headers.Set("Content-Type", "application/json;charset=UTF-8")
})
err := c.PostRaw(URL, payload)

遇到错误时停止所有请求

使用变量stop来标记是否停止，实现类似同步串行执行的效果，以下代码待测试：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28


package main

import (
	"github.com/gocolly/colly/v2"
	"log"

)

// main demonstrates a "stop on first error" switch: once OnError has fired,
// every later request is aborted in OnRequest before it is sent.
func main() {
	stop := false

	collector := colly.NewCollector()

	// Record the failure and flip the switch.
	collector.OnError(func(resp *colly.Response, err error) {
		log.Printf("Request URL:%s failed with response:%v and error:%s\n", resp.Request.URL, resp, err)
		stop = true
	})

	// Before each request, abort it if an earlier request has already failed.
	collector.OnRequest(func(req *colly.Request) {
		if !stop {
			return
		}
		req.Abort()
	})

	collector.Visit("http://example.com")
}

目录