From 0056b572d15d6f34de23b53c2ad6a17d7653787a Mon Sep 17 00:00:00 2001 From: William Vicary Date: Sun, 9 Feb 2020 21:24:50 +0000 Subject: [PATCH 1/2] Changed to a persistent queue from a per-level queue, to avoid slowing down the queue at each level. - The queue uses a struct which maintains awareness of the depth. - Concept of nextqueue removed - Crawler is no longer aware of the current depth --- crawler/crawler.go | 59 ++++++++++++++++++++++++---------------------- crawler/state.go | 23 +++++++----------- 2 files changed, 40 insertions(+), 42 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index 768d6fc..c46cf8d 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -17,14 +17,19 @@ import ( "github.com/benjaminestes/robots/v2" ) +type queuePair struct { + url resolvedURL + depth int +} + type resolvedURL string func (u resolvedURL) String() string { return string(u) } -func (c *Crawler) initialQueue() ([]resolvedURL, error) { - var result []resolvedURL +func (c *Crawler) initialQueue() ([]queuePair, error) { + var result []queuePair for _, s := range c.From { u, err := url.Parse(s) if err != nil { @@ -35,7 +40,7 @@ func (c *Crawler) initialQueue() ([]resolvedURL, error) { if u.Path == "" { u.Path = "/" } - result = append(result, resolvedURL(u.String())) + result = append(result, queuePair{resolvedURL(u.String()), 1}) } return result, nil } @@ -54,8 +59,11 @@ type Crawler struct { Timeout string Header []*data.Pair - depth int - queue []resolvedURL + // mu guards queue when multiple fetches may try to write + // to it simultaneously + mu sync.Mutex + queue []queuePair + seen map[resolvedURL]bool results chan *data.Result @@ -63,11 +71,6 @@ type Crawler struct { // domain robots map[string]func(string) bool - // mu guards nextqueue when multiple fetches may try to write - // to it simultaneously - nextqueue []resolvedURL - mu sync.Mutex - // wg waits for all spawned fetches to complete before // crawling the next level wg sync.WaitGroup @@ -114,7 +117,7 @@ 
func initializedClient(c *Crawler) *http.Client { // If Start returns a non-nil error, calls to Next will fail. func (c *Crawler) Start() error { var err error - + if c.wait, err = time.ParseDuration(c.WaitTime); err != nil { return err } @@ -140,8 +143,8 @@ func (c *Crawler) Start() error { // crawled. Therefore, we add all URLs from the initial queue // to the set of URLs that have been seen, before the crawl // starts. - for _, addr := range c.queue { - c.seen[addr] = true + for _, queuePair := range c.queue { + c.seen[queuePair.url] = true } c.results = make(chan *data.Result, c.Connections) @@ -217,10 +220,10 @@ func (c *Crawler) resetWait() { // crawled. In other words, it assembles the URLs that represent the // next level of the crawl. Many merges could be simultaneously // active. -func (c *Crawler) merge(links []*data.Link) { +func (c *Crawler) merge(links []*data.Link, qp queuePair) { // This is how the crawler terminates — it will encounter an // empty queue if no URLs have been added to the next queue. - if !(c.depth < c.MaxDepth) { + if !(qp.depth < c.MaxDepth) { return } for _, link := range links { @@ -241,7 +244,7 @@ func (c *Crawler) merge(links []*data.Link) { if _, ok := c.seen[linkURL]; !ok { if !(link.Nofollow && c.RespectNofollow) { c.seen[linkURL] = true - c.nextqueue = append(c.nextqueue, linkURL) + c.queue = append(c.queue, queuePair{linkURL, qp.depth + 1}) } } c.mu.Unlock() @@ -251,32 +254,32 @@ func (c *Crawler) merge(links []*data.Link) { // fetch requests a URL, hydrates a result object based on its // contents, if any, and initiates a merge of the links discovered in // the process. -func (c *Crawler) fetch(addr resolvedURL) { - resp, err := requestAsCrawler(c, addr) +func (c *Crawler) fetch(queuePair queuePair) { + resp, err := requestAsCrawler(c, queuePair) if err != nil { // FIXME: Should this panic? Under what conditions would this fail? 
return } defer resp.Body.Close() - - result := data.MakeResult(addr.String(), c.depth, resp) + + result := data.MakeResult(queuePair.url.String(), queuePair.depth, resp) if resp != nil && resp.StatusCode >= 300 && resp.StatusCode < 400 { c.merge([]*data.Link{ &data.Link{ Address: result.ResolvesTo, }, - }) + }, queuePair) } - c.merge(result.Links) + c.merge(result.Links, queuePair) c.results <- result } // addRobots creates a robots.txt matcher from a URL string. If there // is a problem reading from robots.txt, treat it as a server error. func (c *Crawler) addRobots(u resolvedURL) { - resp, err := requestAsCrawler(c, u) + resp, err := requestAsCrawler(c, queuePair{u, 0}) if err != nil { rtxt, _ := robots.From(503, nil) c.robots[u.String()] = rtxt.Tester(c.RobotsUserAgent) @@ -294,17 +297,17 @@ func (c *Crawler) addRobots(u resolvedURL) { c.robots[u.String()] = rtxt.Tester(c.RobotsUserAgent) } -// -func requestAsCrawler(c *Crawler, u resolvedURL) (*http.Response, error) { - req, err := http.NewRequest("GET", u.String(), nil) +// +func requestAsCrawler(c *Crawler, qp queuePair) (*http.Response, error) { + req, err := http.NewRequest("GET", qp.url.String(), nil) if err != nil { return nil, err } - + req.Header.Set("User-Agent", c.UserAgent) for _, h := range c.Header { req.Header.Add(h.K, h.V) } - + return c.client.Do(req) } diff --git a/crawler/state.go b/crawler/state.go index 0053c7c..52cb2fa 100644 --- a/crawler/state.go +++ b/crawler/state.go @@ -44,7 +44,8 @@ func crawlWait(c *Crawler) crawlfn { // URL to be requested. If we get here, it means we've already decided // the URL is in the scope of the crawl as defined by the end user. func crawlCheckRobots(c *Crawler) crawlfn { - addr := c.queue[0] + qp := c.queue[0] + addr := qp.url rtxtURL, err := robots.Locate(addr.String()) if err != nil { // FIXME: Couldn't parse URL. Is this the desired behavior? 
@@ -55,7 +56,7 @@ func crawlCheckRobots(c *Crawler) crawlfn { } if !c.robots[rtxtURL](addr.String()) { // FIXME: Can this be some sort of "emit error" func? - result := data.MakeResult(addr.String(), c.depth, nil) + result := data.MakeResult(addr.String(), qp.depth, nil) result.Status = "Blocked by robots.txt" c.results <- result return crawlNext @@ -67,7 +68,7 @@ func crawlCheckRobots(c *Crawler) crawlfn { // determined to try to crawl. The next step is to secure resources to // actually crawl the URL, and initiate fetching. func crawlDo(c *Crawler) crawlfn { - addr := c.queue[0] + queuePair := c.queue[0] // This blocks when there are = c.Connections fetches active. // Otherwise, it secures a token. c.connections <- true @@ -80,7 +81,7 @@ func crawlDo(c *Crawler) crawlfn { // ultimately the extraction of the links on the // crawled page. Merging of newly discovered URLs // happens as part of this call. - c.fetch(addr) + c.fetch(queuePair) }() return crawlNext } @@ -89,10 +90,14 @@ func crawlDo(c *Crawler) crawlfn { // more URLs in the current queue, we wait for all currently active // fetches to complete. func crawlNext(c *Crawler) crawlfn { + c.mu.Lock() c.queue = c.queue[1:] + c.mu.Unlock() + if len(c.queue) > 0 { return crawlStart } + return crawlAwait } @@ -101,15 +106,5 @@ func crawlNext(c *Crawler) crawlfn { // level by level. func crawlAwait(c *Crawler) crawlfn { c.wg.Wait() - return crawlNextQueue -} - -// crawlNextQueue replace the current queue with the next and starts -// the process again. This next queue represents the accumulated URLs -// in the next level of the crawl that we haven't yet seen. 
-func crawlNextQueue(c *Crawler) crawlfn { - c.queue = c.nextqueue - c.nextqueue = nil - c.depth++ return crawlStartQueue } From 75214f044c177113d5e1325ff919ef93b4d33645 Mon Sep 17 00:00:00 2001 From: William Vicary Date: Wed, 19 Feb 2020 21:59:51 +0000 Subject: [PATCH 2/2] Added MaxPages configuration option Added the ability to limit a crawl to only MaxPages as defined in the configuration file. A value of 0 is treated as infinite. --- README.md | 1 + config.json | 1 + crawl.go | 6 +++++- crawler/config.go | 1 + crawler/crawler.go | 8 +++++++- 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cc0cc12..174b9c4 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,7 @@ particular, you should think about these options: in spider mode. - `MaxDepth`: Only URLs fewer links than `MaxDepth` from the `From` list will be crawled. +- `MaxPages`: Limit the spider to only this number of pages. 0 means unlimited. - `WaitTime`: Pause time between spawning requests. Approximates crawl rate. For instance, to crawl about 5 URLs per second, set this to "200ms". 
It uses Go's [time parsing diff --git a/config.json b/config.json index cdaa429..119b8ea 100644 --- a/config.json +++ b/config.json @@ -8,6 +8,7 @@ "Exclude": [], "MaxDepth": 3, + "MaxPages": 0, "WaitTime": "100ms", "Connections": 20, diff --git a/crawl.go b/crawl.go index 91be764..2e00a9d 100644 --- a/crawl.go +++ b/crawl.go @@ -138,6 +138,7 @@ func doList() { func doCrawl(c *crawler.Crawler) { count, lastCount := 0, 0 lastUpdate := time.Now() + startCrawl := time.Now() err := c.Start() if err != nil { // FIXME: need a way to signal error @@ -156,7 +157,10 @@ func doCrawl(c *crawler.Crawler) { } } - log.Printf("crawl complete, %d URLs total", count) + elapsed := time.Since(startCrawl).Seconds() + // Use float division: truncating a sub-second crawl to 0 whole seconds would panic with an integer divide by zero. + rate := float64(count) / elapsed + log.Printf("crawl complete, %d URLs total in %.1f seconds (~%.1f/sec)", count, elapsed, rate) } func listFromReader(in io.Reader) []string { diff --git a/crawler/config.go b/crawler/config.go index e298ee8..3ba33dc 100644 --- a/crawler/config.go +++ b/crawler/config.go @@ -11,6 +11,7 @@ import ( var defaultCrawler = Crawler{ Connections: 1, MaxDepth: 0, + MaxPages: 0, UserAgent: version.UserAgent(), RobotsUserAgent: "Crawler", diff --git a/crawler/crawler.go b/crawler/crawler.go index c46cf8d..cf51551 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -55,6 +55,7 @@ type Crawler struct { From []string RespectNofollow bool MaxDepth int + MaxPages int WaitTime string Timeout string Header []*data.Pair @@ -201,7 +202,7 @@ func (c *Crawler) willCrawl(fullurl resolvedURL) bool { // out in order ascending by depth. Within a "level" of depth, there is // no guarantee as to which URLs will be crawled first. // -// Result objects are suitable for Marshling into JSON format and conform +// Result objects are suitable for Marshaling into JSON format and conform // to the schema exported by the crawler.Schema package. 
func (c *Crawler) Next() *data.Result { node, ok := <-c.results @@ -242,6 +243,11 @@ func (c *Crawler) merge(links []*data.Link, qp queuePair) { // mutated after it is initialized. c.mu.Lock() if _, ok := c.seen[linkURL]; !ok { + if c.MaxPages > 0 && len(c.seen) >= c.MaxPages { + c.mu.Unlock() // Release the lock + break // Break early + } + if !(link.Nofollow && c.RespectNofollow) { c.seen[linkURL] = true c.queue = append(c.queue, queuePair{linkURL, qp.depth + 1})