Go 爬虫框架 go-colly 之抓取 Instagram 篇

package main
import (
	"crypto/md5"
	"encoding/json"
	"fmt"
	"log"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/gocolly/colly/v2"
)
const (
	// nextPageURL is the format string for the GraphQL query URL used to fetch
	// further pages. Its two verbs are the query hash and the URL-escaped
	// variables payload, in that order.
	nextPageURL string = `https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s`

	// nextPagePayload is the JSON template for the "variables" query
	// parameter. "id" is the user id and "after" is the end cursor of the
	// page that was just read. The quotes must be plain ASCII double quotes,
	// otherwise the formatted payload is not valid JSON.
	nextPagePayload string = `{"id":"%s","first":50,"after":"%s"}`
)
var (
	// requestID is the query hash substituted into nextPageURL. main fills it
	// in from the second queryIdPattern match found in the page's script.
	requestID string

	// requestIds holds the raw queryIdPattern matches from the most recently
	// scanned script body.
	requestIds [][]byte

	// queryIdPattern matches `queryId:"` followed by exactly 32 characters and
	// a closing ASCII double quote. The 9-byte prefix `queryId:"` is what makes
	// the [9:41] slice in main line up with the 32-character value.
	queryIdPattern = regexp.MustCompile(`queryId:".{32}"`)
)
// pageInfo is the pagination cursor object shared by the profile page data
// and the next-page responses.
type pageInfo struct {
	// EndCursor is passed back as the "after" value when requesting the
	// following page.
	EndCursor string `json:"end_cursor"`
	// NextPage reports whether another page is available.
	NextPage bool `json:"has_next_page"`
}
// mainPageData is the decoding target for the JSON object that main extracts
// from the first inline script of the profile page. Only the fields listed
// here are decoded; everything else in the document is ignored.
type mainPageData struct {
	// Rhxgis is stored in the request context under "gis" and used by main to
	// compute the X-Instagram-GIS header for next-page requests.
	Rhxgis    string `json:"rhx_gis"`
	EntryData struct {
		// main reads only the first element of ProfilePage.
		ProfilePage []struct {
			Graphql struct {
				User struct {
					Id    string `json:"id"`
					Media struct {
						Edges []struct {
							Node struct {
								ImageURL     string `json:"display_url"`
								ThumbnailURL string `json:"thumbnail_src"`
								IsVideo      bool   `json:"is_video"`
								Date         int    `json:"date"`
								Dimensions   struct {
									Width  int `json:"width"`
									Height int `json:"height"`
								} `json:"dimensions"`
							} `json:"node"`
						} `json:"edges"`
						PageInfo pageInfo `json:"page_info"`
					} `json:"edge_owner_to_timeline_media"`
				} `json:"user"`
			} `json:"graphql"`
		} `json:"ProfilePage"`
	} `json:"entry_data"`
}
// nextPageData is the decoding target for the JSON responses returned by the
// next-page query URL. Only the fields listed here are decoded.
type nextPageData struct {
	Data struct {
		User struct {
			Container struct {
				PageInfo pageInfo `json:"page_info"`
				Edges    []struct {
					Node struct {
						ImageURL     string `json:"display_url"`
						ThumbnailURL string `json:"thumbnail_src"`
						IsVideo      bool   `json:"is_video"`
						Date         int    `json:"taken_at_timestamp"`
						Dimensions   struct {
							Width  int `json:"width"`
							Height int `json:"height"`
						} `json:"dimensions"`
					} `json:"node"`
				} `json:"edges"`
			} `json:"edge_owner_to_timeline_media"`
		} `json:"user"`
	} `json:"data"`
}
func main() {
if len(os.Args) != 2 {
log.Println(“Missing account name argument”)
os.Exit(1)
}
var actualUserId string
instagramAccount := os.Args[1]
outputDir := fmt.Sprintf(“./instagram_%s/”, instagramAccount)
c := colly.NewCollector(
//colly.CacheDir(“./_instagram_cache/”),
colly.UserAgent(“Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36”),
)
c.OnRequest(func(r *colly.Request) {
r.Headers.Set(“X-Requested-With”, “XMLHttpRequest”)
r.Headers.Set(“Referer”, “https://www.instagram.com/”+instagramAccount)
if r.Ctx.Get(“gis”) != “” {
gis := fmt.Sprintf(“%s:%s”, r.Ctx.Get(“gis”), r.Ctx.Get(“variables”))
h := md5.New()
h.Write([]byte(gis))
gisHash := fmt.Sprintf(“%x”, h.Sum(nil))
r.Headers.Set(“X-Instagram-GIS”, gisHash)
}
})
c.OnHTML(“html”, func(e *colly.HTMLElement) {
d := c.Clone()
d.OnResponse(func(r *colly.Response) {
requestIds = queryIdPattern.FindAll(r.Body, 1)
requestID = string(requestIds[1][9:41])
})
requestIDURL := e.Request.AbsoluteURL(e.ChildAttr(`link[as=”script”]`, “href”))
d.Visit(requestIDURL)
dat := e.ChildText(“body > script:first-of-type”)
jsonData := dat[strings.Index(dat, “{“) : len(dat)1]
data := &mainPageData{}
err := json.Unmarshal([]byte(jsonData), data)
if err != nil {
log.Fatal(err)
}
log.Println(“saving output to “, outputDir)
os.MkdirAll(outputDir, os.ModePerm)
page := data.EntryData.ProfilePage[0]
actualUserId = page.Graphql.User.Id
for _, obj := range page.Graphql.User.Media.Edges {
// skip videos
if obj.Node.IsVideo {
continue
}
c.Visit(obj.Node.ImageURL)
}
nextPageVars := fmt.Sprintf(nextPagePayload, actualUserId, page.Graphql.User.Media.PageInfo.EndCursor)
e.Request.Ctx.Put(“variables”, nextPageVars)
if page.Graphql.User.Media.PageInfo.NextPage {
u := fmt.Sprintf(
nextPageURL,
requestID,
url.QueryEscape(nextPageVars),
)
log.Println(“Next page found”, u)
e.Request.Ctx.Put(“gis”, data.Rhxgis)
e.Request.Visit(u)
}
})
c.OnError(func(r *colly.Response, e error) {
log.Println(“error:”, e, r.Request.URL, string(r.Body))
})
c.OnResponse(func(r *colly.Response) {
if strings.Index(r.Headers.Get(“Content-Type”), “image”) > 1 {
r.Save(outputDir + r.FileName())
return
}
if strings.Index(r.Headers.Get(“Content-Type”), “json”) == 1 {
return
}
data := &nextPageData{}
err := json.Unmarshal(r.Body, data)
if err != nil {
log.Fatal(err)
}
for _, obj := range data.Data.User.Container.Edges {
// skip videos
if obj.Node.IsVideo {
continue
}
c.Visit(obj.Node.ImageURL)
}
if data.Data.User.Container.PageInfo.NextPage {
nextPageVars := fmt.Sprintf(nextPagePayload, actualUserId, data.Data.User.Container.PageInfo.EndCursor)
r.Request.Ctx.Put(“variables”, nextPageVars)
u := fmt.Sprintf(
nextPageURL,
requestID,
url.QueryEscape(nextPageVars),
)
log.Println(“Next page found”, u)
r.Request.Visit(u)
}
})
c.Visit(“https://instagram.com/” + instagramAccount)
}

相关推荐

在这里插入图片描述

Navicat导入sql报错[Err] 1046 – No database selected

ReactNative源码分析之UI事件分发

使用 sklearn 完成对模型分类性能的评估 Educoder

在这里插入图片描述

故宫博物馆爬虫(简略版)