From 0056b572d15d6f34de23b53c2ad6a17d7653787a Mon Sep 17 00:00:00 2001
From: William Vicary <will@3whitehats.com>
Date: Sun, 9 Feb 2020 21:24:50 +0000
Subject: [PATCH 1/2] Changed to a persistent queue from a per-level queue, to
 avoid slowing down the queue at each level. - The queue uses a struct which
 maintains awareness of the depth. - Concept of nextqueue removed - Crawler is
 no longer aware of the current depth

---
 crawler/crawler.go | 59 ++++++++++++++++++++++++----------------------
 crawler/state.go   | 23 +++++++-----------
 2 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/crawler/crawler.go b/crawler/crawler.go
index 768d6fc..c46cf8d 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -17,14 +17,19 @@ import (
 	"github.com/benjaminestes/robots/v2"
 )
 
+type queuePair struct {
+	url   resolvedURL
+	depth int
+}
+
 type resolvedURL string
 
 func (u resolvedURL) String() string {
 	return string(u)
 }
 
-func (c *Crawler) initialQueue() ([]resolvedURL, error) {
-	var result []resolvedURL
+func (c *Crawler) initialQueue() ([]queuePair, error) {
+	var result []queuePair
 	for _, s := range c.From {
 		u, err := url.Parse(s)
 		if err != nil {
@@ -35,7 +40,7 @@ func (c *Crawler) initialQueue() ([]resolvedURL, error) {
 		if u.Path == "" {
 			u.Path = "/"
 		}
-		result = append(result, resolvedURL(u.String()))
+		result = append(result, queuePair{resolvedURL(u.String()), 1})
 	}
 	return result, nil
 }
@@ -54,8 +59,11 @@ type Crawler struct {
 	Timeout         string
 	Header          []*data.Pair
 
-	depth   int
-	queue   []resolvedURL
+	// mu guards queue when multiple fetches may try to write
+	// to it simultaneously
+	mu    sync.Mutex
+	queue []queuePair
+
 	seen    map[resolvedURL]bool
 	results chan *data.Result
 
@@ -63,11 +71,6 @@ type Crawler struct {
 	// domain
 	robots map[string]func(string) bool
 
-	// mu guards nextqueue when multiple fetches may try to write
-	// to it simultaneously
-	nextqueue []resolvedURL
-	mu        sync.Mutex
-
 	// wg waits for all spawned fetches to complete before
 	// crawling the next level
 	wg sync.WaitGroup
@@ -114,7 +117,7 @@ func initializedClient(c *Crawler) *http.Client {
 // If Start returns a non-nil error, calls to Next will fail.
 func (c *Crawler) Start() error {
 	var err error
-	
+
 	if c.wait, err = time.ParseDuration(c.WaitTime); err != nil {
 		return err
 	}
@@ -140,8 +143,8 @@ func (c *Crawler) Start() error {
 	// crawled. Therefore, we add all URLs from the initial queue
 	// to the set of URLs that have been seen, before the crawl
 	// starts.
-	for _, addr := range c.queue {
-		c.seen[addr] = true
+	for _, qp := range c.queue {
+		c.seen[qp.url] = true
 	}
 
 	c.results = make(chan *data.Result, c.Connections)
@@ -217,10 +220,10 @@ func (c *Crawler) resetWait() {
 // crawled.  In other words, it assembles the URLs that represent the
 // next level of the crawl. Many merges could be simultaneously
 // active.
-func (c *Crawler) merge(links []*data.Link) {
+func (c *Crawler) merge(links []*data.Link, qp queuePair) {
 	// This is how the crawler terminates — it will encounter an
 	// empty queue if no URLs have been added to the next queue.
-	if !(c.depth < c.MaxDepth) {
+	if !(qp.depth < c.MaxDepth) {
 		return
 	}
 	for _, link := range links {
@@ -241,7 +244,7 @@ func (c *Crawler) merge(links []*data.Link) {
 		if _, ok := c.seen[linkURL]; !ok {
 			if !(link.Nofollow && c.RespectNofollow) {
 				c.seen[linkURL] = true
-				c.nextqueue = append(c.nextqueue, linkURL)
+				c.queue = append(c.queue, queuePair{linkURL, qp.depth + 1})
 			}
 		}
 		c.mu.Unlock()
@@ -251,32 +254,32 @@ func (c *Crawler) merge(links []*data.Link) {
 // fetch requests a URL, hydrates a result object based on its
 // contents, if any, and initiates a merge of the links discovered in
 // the process.
-func (c *Crawler) fetch(addr resolvedURL) {
-	resp, err := requestAsCrawler(c, addr)
+func (c *Crawler) fetch(qp queuePair) {
+	resp, err := requestAsCrawler(c, qp)
 	if err != nil {
 		// FIXME: Should this panic? Under what conditions would this fail?
 		return
 	}
 	defer resp.Body.Close()
-	
-	result := data.MakeResult(addr.String(), c.depth, resp)
+
+	result := data.MakeResult(qp.url.String(), qp.depth, resp)
 
 	if resp != nil && resp.StatusCode >= 300 && resp.StatusCode < 400 {
 		c.merge([]*data.Link{
 			&data.Link{
 				Address: result.ResolvesTo,
 			},
-		})
+		}, qp)
 	}
 
-	c.merge(result.Links)
+	c.merge(result.Links, qp)
 	c.results <- result
 }
 
 // addRobots creates a robots.txt matcher from a URL string. If there
 // is a problem reading from robots.txt, treat it as a server error.
 func (c *Crawler) addRobots(u resolvedURL) {
-	resp, err := requestAsCrawler(c, u)
+	resp, err := requestAsCrawler(c, queuePair{u, 0})
 	if err != nil {
 		rtxt, _ := robots.From(503, nil)
 		c.robots[u.String()] = rtxt.Tester(c.RobotsUserAgent)
@@ -294,17 +297,17 @@ func (c *Crawler) addRobots(u resolvedURL) {
 	c.robots[u.String()] = rtxt.Tester(c.RobotsUserAgent)
 }
 
-// 
-func requestAsCrawler(c *Crawler, u resolvedURL) (*http.Response, error) {
-	req, err := http.NewRequest("GET", u.String(), nil)
+// requestAsCrawler GETs qp.url using the crawler's User-Agent, extra headers, and HTTP client.
+func requestAsCrawler(c *Crawler, qp queuePair) (*http.Response, error) {
+	req, err := http.NewRequest("GET", qp.url.String(), nil)
 	if err != nil {
 		return nil, err
 	}
-	
+
 	req.Header.Set("User-Agent", c.UserAgent)
 	for _, h := range c.Header {
 		req.Header.Add(h.K, h.V)
 	}
-	
+
 	return c.client.Do(req)
 }
diff --git a/crawler/state.go b/crawler/state.go
index 0053c7c..52cb2fa 100644
--- a/crawler/state.go
+++ b/crawler/state.go
@@ -44,7 +44,8 @@ func crawlWait(c *Crawler) crawlfn {
 // URL to be requested. If we get here, it means we've already decided
 // the URL is in the scope of the crawl as defined by the end user.
 func crawlCheckRobots(c *Crawler) crawlfn {
-	addr := c.queue[0]
+	qp := c.queue[0]
+	addr := qp.url
 	rtxtURL, err := robots.Locate(addr.String())
 	if err != nil {
 		// FIXME: Couldn't parse URL. Is this the desired behavior?
@@ -55,7 +56,7 @@ func crawlCheckRobots(c *Crawler) crawlfn {
 	}
 	if !c.robots[rtxtURL](addr.String()) {
 		// FIXME: Can this be some sort of "emit error" func?
-		result := data.MakeResult(addr.String(), c.depth, nil)
+		result := data.MakeResult(addr.String(), qp.depth, nil)
 		result.Status = "Blocked by robots.txt"
 		c.results <- result
 		return crawlNext
@@ -67,7 +68,7 @@ func crawlCheckRobots(c *Crawler) crawlfn {
 // determined to try to crawl. The next step is to secure resources to
 // actually crawl the URL, and initiate fetching.
 func crawlDo(c *Crawler) crawlfn {
-	addr := c.queue[0]
+	qp := c.queue[0]
 	// This blocks when there are = c.Connections fetches active.
 	// Otherwise, it secures a token.
 	c.connections <- true
@@ -80,7 +81,7 @@ func crawlDo(c *Crawler) crawlfn {
 		// ultimately the extraction of the links on the
 		// crawled page. Merging of newly discovered URLs
 		// happens as part of this call.
-		c.fetch(addr)
+		c.fetch(qp)
 	}()
 	return crawlNext
 }
@@ -89,10 +90,14 @@ func crawlDo(c *Crawler) crawlfn {
 // more URLs in the current queue, we wait for all currently active
 // fetches to complete.
 func crawlNext(c *Crawler) crawlfn {
+	// Pop and re-check under mu: fetches append to c.queue concurrently.
+	c.mu.Lock()
 	c.queue = c.queue[1:]
-	if len(c.queue) > 0 {
+	more := len(c.queue) > 0
+	c.mu.Unlock()
+	if more {
 		return crawlStart
 	}
 	return crawlAwait
 }
 
@@ -101,15 +106,5 @@ func crawlNext(c *Crawler) crawlfn {
 // level by level.
 func crawlAwait(c *Crawler) crawlfn {
 	c.wg.Wait()
-	return crawlNextQueue
-}
-
-// crawlNextQueue replace the current queue with the next and starts
-// the process again. This next queue represents the accumulated URLs
-// in the next level of the crawl that we haven't yet seen.
-func crawlNextQueue(c *Crawler) crawlfn {
-	c.queue = c.nextqueue
-	c.nextqueue = nil
-	c.depth++
 	return crawlStartQueue
 }

From 75214f044c177113d5e1325ff919ef93b4d33645 Mon Sep 17 00:00:00 2001
From: William Vicary <will@3whitehats.com>
Date: Wed, 19 Feb 2020 21:59:51 +0000
Subject: [PATCH 2/2] Added MaxPages configuration option

Added the ability to limit a crawl to only MaxPages as defined in the configuration file. A value of 0 is treated as infinite.
---
 README.md          | 1 +
 config.json        | 1 +
 crawl.go           | 6 +++++-
 crawler/config.go  | 1 +
 crawler/crawler.go | 8 +++++++-
 5 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index cc0cc12..174b9c4 100644
--- a/README.md
+++ b/README.md
@@ -106,6 +106,7 @@ particular, you should think about these options:
     in spider mode.
 - `MaxDepth`: Only URLs fewer links than `MaxDepth` from the `From`
     list will be crawled.
+- `MaxPages`: Limit the spider to only this number of pages. 0 means unlimited.
 - `WaitTime`: Pause time between spawning requests. Approximates crawl
     rate.  For instance, to crawl about 5 URLs per second, set this to
     "200ms". It uses Go's [time parsing
diff --git a/config.json b/config.json
index cdaa429..119b8ea 100644
--- a/config.json
+++ b/config.json
@@ -8,6 +8,7 @@
     "Exclude": [],
 
     "MaxDepth": 3,
+    "MaxPages": 0,
 
     "WaitTime": "100ms",
     "Connections": 20,
diff --git a/crawl.go b/crawl.go
index 91be764..2e00a9d 100644
--- a/crawl.go
+++ b/crawl.go
@@ -138,6 +138,7 @@ func doList() {
 func doCrawl(c *crawler.Crawler) {
 	count, lastCount := 0, 0
 	lastUpdate := time.Now()
+	startCrawl := time.Now()
 	err := c.Start()
 	if err != nil {
 		// FIXME: need a way to signal error
@@ -156,7 +157,10 @@ func doCrawl(c *crawler.Crawler) {
 		}
 	}
 
-	log.Printf("crawl complete, %d URLs total", count)
+	elapsed := time.Since(startCrawl).Seconds()
+	rate := float64(count) / elapsed // float math: no divide-by-zero panic on sub-second crawls
+
+	log.Printf("crawl complete, %d URLs total in %d seconds (~%.1f/sec)", count, int(elapsed), rate)
 }
 
 func listFromReader(in io.Reader) []string {
diff --git a/crawler/config.go b/crawler/config.go
index e298ee8..3ba33dc 100644
--- a/crawler/config.go
+++ b/crawler/config.go
@@ -11,6 +11,7 @@ import (
 var defaultCrawler = Crawler{
 	Connections:     1,
 	MaxDepth:        0,
+	MaxPages:        0,
 	UserAgent:       version.UserAgent(),
 	RobotsUserAgent: "Crawler",
 
diff --git a/crawler/crawler.go b/crawler/crawler.go
index c46cf8d..cf51551 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -55,6 +55,7 @@ type Crawler struct {
 	From            []string
 	RespectNofollow bool
 	MaxDepth        int
+	MaxPages        int
 	WaitTime        string
 	Timeout         string
 	Header          []*data.Pair
@@ -201,7 +202,7 @@ func (c *Crawler) willCrawl(fullurl resolvedURL) bool {
 // out in order ascending by depth. Within a "level" of depth, there is
 // no guarantee as to which URLs will be crawled first.
 //
-// Result objects are suitable for Marshling into JSON format and conform
+// Result objects are suitable for Marshaling into JSON format and conform
 // to the schema exported by the crawler.Schema package.
 func (c *Crawler) Next() *data.Result {
 	node, ok := <-c.results
@@ -242,6 +243,11 @@ func (c *Crawler) merge(links []*data.Link, qp queuePair) {
 		// mutated after it is initialized.
 		c.mu.Lock()
 		if _, ok := c.seen[linkURL]; !ok {
+			if c.MaxPages > 0 && len(c.seen) >= c.MaxPages {
+				c.mu.Unlock() // must release mu here: break skips the loop's own Unlock
+				break         // page budget reached; queue nothing more from this page
+			}
+
 			if !(link.Nofollow && c.RespectNofollow) {
 				c.seen[linkURL] = true
 				c.queue = append(c.queue, queuePair{linkURL, qp.depth + 1})