当前位置:网站首页>Go crawler framework -colly actual combat (III) -- panoramic cartoon picture capture and download

Go crawler framework colly in practice (III): scraping and downloading cartoon images from Quanjing

2022-06-25 00:17:00 You're like an ironclad treasure

Original link :Hzy Blog

Today, let's use colly to scrape a photo website and download its images. It's quite fun.

Let's go straight to the code.
The complete code can be found on my GitHub, where I keep posting small problems I run into while learning Go, along with small examples!
GitHub

Notes:

  • You need to send cookies; otherwise access will be denied.
  • A single request seems to return at most 200 images, and raising the page-size parameter does not help, so we have to loop over the pages.
  • Logic: one collector fetches the search result pages, and a second collector downloads the images.
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
	"io"
	"net/url"
	"os"
	"strings"
	"time"
)

// TODO: use colly to crawl the cartoon images on https://www.quanjing.com

/*
1. First press F12 and inspect this page: https://www.quanjing.com/search.aspx?q=%E5%8D%A1%E9%80%9A#%E5%8D%A1%E9%80%9A||1|1000|3|2||||||
2. You will find that the data is actually loaded as JSON from this URL: https://www.quanjing.com/Handler/SearchUrl.ashx
3. Look at which parameters are passed and send the same ones.
*/

// searchResult holds the fields this program reads from the JSON payload
// returned by the search endpoint.
type searchResult struct {
	ImgList []struct {
		ImgURL  string `json:"imgurl"`
		Caption string `json:"caption"`
	} `json:"imglist"`
}

// stripJSONP returns the bytes between the first '(' and the last ')' of a
// JSONP response such as `searchresult({...})`. It reports an error instead
// of slicing out of range when the body does not have that shape.
func stripJSONP(body []byte) ([]byte, error) {
	open := bytes.IndexByte(body, '(')
	end := bytes.LastIndexByte(body, ')')
	if open < 0 || end <= open {
		return nil, fmt.Errorf("response is not JSONP (%d bytes)", len(body))
	}
	return body[open+1 : end], nil
}

// imageFileName derives the file name for an image from its caption.
// Commas are replaced with underscores, as are path separators, so a caption
// can never point outside the download directory. An empty caption yields
// the fixed fallback name.
func imageFileName(caption string) string {
	if caption == "" {
		return " Unknown "
	}
	safe := strings.Map(func(r rune) rune {
		switch r {
		case ',', '/', '\\':
			return '_'
		}
		return r
	}, caption)
	return safe + ".jpg"
}

// saveImage writes data to ./download/<name>. The file is always closed, and
// a failure to close is reported when the write itself succeeded.
func saveImage(name string, data []byte) (err error) {
	f, err := os.Create("./download/" + name)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()
	_, err = io.Copy(f, bytes.NewReader(data))
	return err
}

// main crawls the quanjing.com search endpoint page by page with one
// collector and hands every image URL it finds to a second collector, which
// stores the image under ./download. Failures on a single page or image are
// printed and skipped so that one bad response does not abort the crawl.
func main() {
	start := time.Now()

	// os.Create does not create parent directories, so make sure the target
	// directory exists before any download callback runs.
	if err := os.MkdirAll("./download", 0755); err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	c := colly.NewCollector(func(collector *colly.Collector) {
		collector.Async = true
		extensions.RandomUserAgent(collector)
	})
	imageC := c.Clone()

	// Request headers for the search endpoint; the cookie is required,
	// otherwise access is denied.
	c.OnRequest(func(r *colly.Request) {
		r.Headers.Set("Cookie", "BIGipServerPools_Web_ssl=2135533760.47873.0000; Hm_lvt_c01558ab05fd344e898880e9fc1b65c4=1577432018; qimo_seosource_578c8dc0-6fab-11e8-ab7a-fda8d0606763=%E7%BB%94%E6%AC%8F%E5%94%B4; qimo_seokeywords_578c8dc0-6fab-11e8-ab7a-fda8d0606763=; accessId=578c8dc0-6fab-11e8-ab7a-fda8d0606763; pageViewNum=3; Hm_lpvt_c01558ab05fd344e898880e9fc1b65c4=1577432866")
		r.Headers.Add("referer", "https://www.quanjing.com/search.aspx?q=%E5%8D%A1%E9%80%9A")
		r.Headers.Add("sec-fetch-mode", "cors")
		r.Headers.Add("sec-fetch-site", "same-origin")
		r.Headers.Add("accept", "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01")
		// NOTE(review): advertising deflate/br invites response encodings the
		// client may not decode — confirm against the colly version in use.
		r.Headers.Add("accept-encoding", "gzip, deflate, br")
		r.Headers.Add("accept-language", "en,zh-CN;q=0.9,zh;q=0.8")
		r.Headers.Add("X-Requested-With", "XMLHttpRequest")
	})

	// Decode one page of search results and queue every image on imageC.
	c.OnResponse(func(r *colly.Response) {
		payload, err := stripJSONP(r.Body)
		if err != nil {
			fmt.Printf("skip %s: %v\n", r.Request.URL, err)
			return
		}
		var result searchResult
		if err := json.Unmarshal(payload, &result); err != nil {
			fmt.Printf("skip %s: %v\n", r.Request.URL, err)
			return
		}
		for k, img := range result.ImgList {
			if img.ImgURL == "" {
				continue
			}
			fmt.Printf("find -->%d:%s#%s\n", k, img.ImgURL, img.Caption)
			// The caption travels in the URL fragment; escape it so that
			// characters such as '%', '#' or spaces survive URL parsing.
			target := img.ImgURL + "#" + url.PathEscape(img.Caption)
			if err := imageC.Visit(target); err != nil {
				fmt.Printf("visit %s: %v\n", img.ImgURL, err)
			}
		}
	})
	c.OnError(func(response *colly.Response, err error) {
		fmt.Println(err)
	})

	// Store each downloaded image under the caption carried in the fragment.
	imageC.OnResponse(func(r *colly.Response) {
		fileName := imageFileName(r.Request.URL.Fragment)
		fmt.Printf(" download  -->%s \n", fileName)
		if err := saveImage(fileName, r.Body); err != nil {
			fmt.Printf("save %s: %v\n", fileName, err)
		}
	})
	imageC.OnError(func(response *colly.Response, err error) {
		fmt.Println(err)
	})

	// Build the search URLs. One request appears to return at most pageSize
	// images, so several pages are requested.
	const (
		pageSize = 200 // images per request
		pageNum  = 10  // number of pages to request
		// NOTE(review): the Referer header searches for %E5%8D%A1%E9%80%9A;
		// confirm this term is the intended query.
		searchTerm = " cartoon A"
	)
	for i := 0; i < pageNum; i++ {
		// The term is query-escaped: a raw space would make the request
		// target malformed.
		searchURL := fmt.Sprintf("https://www.quanjing.com/Handler/SearchUrl.ashx?t=1952&callback=searchresult&q=%s&stype=1&pagesize=%d&pagenum=%d&imageType=2&imageColor=&brand=&imageSType=&fr=1&sortFlag=1&imageUType=&btype=&authid=&_=1577435470818", url.QueryEscape(searchTerm), pageSize, i)
		if err := c.Visit(searchURL); err != nil {
			fmt.Printf("visit page %d: %v\n", i, err)
		}
	}

	// Wait for the search pages first: their callbacks are what enqueue the
	// image requests that imageC.Wait then drains.
	c.Wait()
	imageC.Wait()
	fmt.Printf("done,cost:%s\n", time.Since(start))
}

Result

 Insert picture description here

Tomorrow, let's see what other fun little projects we can build with colly!

原网站

版权声明
本文为[You're like an ironclad treasure]所创,转载请带上原文链接,感谢
https://yzsam.com/2022/02/202202210551199103.html