Go 爬虫框架 go-colly 之抓取 Reddit 篇

package main
import (
	"fmt"
	"os"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)
// item holds the data scraped for a single story on a subreddit listing page.
type item struct {
// StoryURL is the href of the story's title link.
StoryURL string
// Source identifies the listing the story was scraped from.
Source string
// NOTE(review): comments (lowercase) is never assigned in the visible code
// and differs from Comments only by case — looks like a leftover; confirm
// before removing, since it affects the struct's printed form.
comments string
// CrawledAt is the time at which the story was scraped.
CrawledAt time.Time
// Comments is the href of the story's comments link.
Comments string
// Title is the text of the story's title link.
Title string
}
func main() {
stories := []item{}
// Instantiate default collector
c := colly.NewCollector(
// Visit only domains: old.reddit.com
colly.AllowedDomains(“old.reddit.com”),
colly.Async(),
)
// On every a element which has .top-matter attribute call callback
// This class is unique to the div that holds all information about a story
c.OnHTML(“.top-matter”, func(e *colly.HTMLElement) {
temp := item{}
temp.StoryURL = e.ChildAttr(“a[data-event-action=title]”, “href”)
temp.Source = “https://old.reddit.com/r/programming/”
temp.Title = e.ChildText(“a[data-event-action=title]”)
temp.Comments = e.ChildAttr(“a[data-event-action=comments]”, “href”)
temp.CrawledAt = time.Now()
stories = append(stories, temp)
})
// On every span tag with the class next-button
c.OnHTML(“span.next-button”, func(h *colly.HTMLElement) {
t := h.ChildAttr(“a”, “href”)
c.Visit(t)
})
// Set max Parallelism and introduce a Random Delay
c.Limit(&colly.LimitRule{
Parallelism: 2,
RandomDelay: 5 * time.Second,
})
// Before making a request print “Visiting …”
c.OnRequest(func(r *colly.Request) {
fmt.Println(“Visiting”, r.URL.String())
})
// Crawl all reddits the user passes in
reddits := os.Args[1:]
for _, reddit := range reddits {
c.Visit(reddit)
}
c.Wait()
fmt.Println(stories)
}

相关推荐

在这里插入图片描述

pandas:sort_values排序后concat“失效”问题解决

在这里插入图片描述

【每日爬虫】:给自己打造一个温馨的家,面朝大海,春暖花开

解决SpringDataJPA报错:org.hibernate.HibernateException: Access to DialectResolutionInfo cannot be null w

【求助!!!】selenium如何不生成geckodriver.log文件