Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ particular, you should think about these options:
in spider mode.
- `MaxDepth`: Only URLs fewer than `MaxDepth` links away from the `From`
list will be crawled.
- `MaxPages`: Crawl at most this many pages; 0 means unlimited.
- `WaitTime`: Pause time between spawning requests. Approximates crawl
rate. For instance, to crawl about 5 URLs per second, set this to
"200ms". It uses Go's [time parsing
Expand Down
1 change: 1 addition & 0 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"Exclude": [],

"MaxDepth": 3,
"MaxPages": 0,

"WaitTime": "100ms",
"Connections": 20,
Expand Down
6 changes: 5 additions & 1 deletion crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ func doList() {
func doCrawl(c *crawler.Crawler) {
count, lastCount := 0, 0
lastUpdate := time.Now()
startCrawl := time.Now()
err := c.Start()
if err != nil {
// FIXME: need a way to signal error
Expand All @@ -156,7 +157,10 @@ func doCrawl(c *crawler.Crawler) {
}
}

log.Printf("crawl complete, %d URLs total", count)
totalTime := int(time.Since(startCrawl).Seconds())
rate := count / totalTime

log.Printf("crawl complete, %d URLs total in %d seconds (~%d/sec)", count, totalTime, rate )
}

func listFromReader(in io.Reader) []string {
Expand Down
1 change: 1 addition & 0 deletions crawler/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
var defaultCrawler = Crawler{
Connections: 1,
MaxDepth: 0,
MaxPages: 0,
UserAgent: version.UserAgent(),
RobotsUserAgent: "Crawler",

Expand Down
67 changes: 38 additions & 29 deletions crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,19 @@ import (
"github.com/benjaminestes/robots/v2"
)

// queuePair couples a URL with the depth at which it was discovered,
// so every queue entry carries its own crawl depth instead of relying
// on a single crawler-wide depth counter.
type queuePair struct {
url resolvedURL
depth int
}

// resolvedURL is a fully-resolved (absolute) URL string.
type resolvedURL string

// String returns the URL as a plain string, satisfying fmt.Stringer.
func (u resolvedURL) String() string {
return string(u)
}

func (c *Crawler) initialQueue() ([]resolvedURL, error) {
var result []resolvedURL
func (c *Crawler) initialQueue() ([]queuePair, error) {
var result []queuePair
for _, s := range c.From {
u, err := url.Parse(s)
if err != nil {
Expand All @@ -35,7 +40,7 @@ func (c *Crawler) initialQueue() ([]resolvedURL, error) {
if u.Path == "" {
u.Path = "/"
}
result = append(result, resolvedURL(u.String()))
result = append(result, queuePair{resolvedURL(u.String()), 1})
}
return result, nil
}
Expand All @@ -50,24 +55,23 @@ type Crawler struct {
From []string
RespectNofollow bool
MaxDepth int
MaxPages int
WaitTime string
Timeout string
Header []*data.Pair

depth int
queue []resolvedURL
// mu guards queue when multiple fetches
// to it simultaneously
mu sync.Mutex
queue []queuePair

seen map[resolvedURL]bool
results chan *data.Result

// robots maintains a robots.txt matcher for every encountered
// domain
robots map[string]func(string) bool

// mu guards nextqueue when multiple fetches may try to write
// to it simultaneously
nextqueue []resolvedURL
mu sync.Mutex

// wg waits for all spawned fetches to complete before
// crawling the next level
wg sync.WaitGroup
Expand Down Expand Up @@ -114,7 +118,7 @@ func initializedClient(c *Crawler) *http.Client {
// If Start returns a non-nil error, calls to Next will fail.
func (c *Crawler) Start() error {
var err error

if c.wait, err = time.ParseDuration(c.WaitTime); err != nil {
return err
}
Expand All @@ -140,8 +144,8 @@ func (c *Crawler) Start() error {
// crawled. Therefore, we add all URLs from the initial queue
// to the set of URLs that have been seen, before the crawl
// starts.
for _, addr := range c.queue {
c.seen[addr] = true
for _, queuePair := range c.queue {
c.seen[queuePair.url] = true
}

c.results = make(chan *data.Result, c.Connections)
Expand Down Expand Up @@ -198,7 +202,7 @@ func (c *Crawler) willCrawl(fullurl resolvedURL) bool {
// out in order ascending by depth. Within a "level" of depth, there is
// no guarantee as to which URLs will be crawled first.
//
// Result objects are suitable for Marshling into JSON format and conform
// Result objects are suitable for Marshaling into JSON format and conform
// to the schema exported by the crawler.Schema package.
func (c *Crawler) Next() *data.Result {
node, ok := <-c.results
Expand All @@ -217,10 +221,10 @@ func (c *Crawler) resetWait() {
// crawled. In other words, it assembles the URLs that represent the
// next level of the crawl. Many merges could be simultaneously
// active.
func (c *Crawler) merge(links []*data.Link) {
func (c *Crawler) merge(links []*data.Link, qp queuePair) {
// This is how the crawler terminates — it will encounter an
// empty queue if no URLs have been added to the next queue.
if !(c.depth < c.MaxDepth) {
if !(qp.depth < c.MaxDepth) {
return
}
for _, link := range links {
Expand All @@ -239,9 +243,14 @@ func (c *Crawler) merge(links []*data.Link) {
// mutated after it is initialized.
c.mu.Lock()
if _, ok := c.seen[linkURL]; !ok {
if c.MaxPages > 0 && len(c.seen) >= c.MaxPages {
c.mu.Unlock() // Release the lock
break // Break early
}

if !(link.Nofollow && c.RespectNofollow) {
c.seen[linkURL] = true
c.nextqueue = append(c.nextqueue, linkURL)
c.queue = append(c.queue, queuePair{linkURL, qp.depth + 1})
}
}
c.mu.Unlock()
Expand All @@ -251,32 +260,32 @@ func (c *Crawler) merge(links []*data.Link) {
// fetch requests a URL, hydrates a result object based on its
// contents, if any, and initiates a merge of the links discovered in
// the process.
func (c *Crawler) fetch(addr resolvedURL) {
resp, err := requestAsCrawler(c, addr)
func (c *Crawler) fetch(queuePair queuePair) {
resp, err := requestAsCrawler(c, queuePair)
if err != nil {
// FIXME: Should this panic? Under what conditions would this fail?
return
}
defer resp.Body.Close()
result := data.MakeResult(addr.String(), c.depth, resp)

result := data.MakeResult(queuePair.url.String(), queuePair.depth, resp)

if resp != nil && resp.StatusCode >= 300 && resp.StatusCode < 400 {
c.merge([]*data.Link{
&data.Link{
Address: result.ResolvesTo,
},
})
}, queuePair)
}

c.merge(result.Links)
c.merge(result.Links, queuePair)
c.results <- result
}

// addRobots creates a robots.txt matcher from a URL string. If there
// is a problem reading from robots.txt, treat it as a server error.
func (c *Crawler) addRobots(u resolvedURL) {
resp, err := requestAsCrawler(c, u)
resp, err := requestAsCrawler(c, queuePair{u, 0})
if err != nil {
rtxt, _ := robots.From(503, nil)
c.robots[u.String()] = rtxt.Tester(c.RobotsUserAgent)
Expand All @@ -294,17 +303,17 @@ func (c *Crawler) addRobots(u resolvedURL) {
c.robots[u.String()] = rtxt.Tester(c.RobotsUserAgent)
}

//
func requestAsCrawler(c *Crawler, u resolvedURL) (*http.Response, error) {
req, err := http.NewRequest("GET", u.String(), nil)
// requestAsCrawler builds a GET request for qp.url, applies the
// crawler's configured User-Agent and any extra headers from c.Header,
// and performs the request with the crawler's shared HTTP client.
func requestAsCrawler(c *Crawler, qp queuePair) (*http.Response, error) {
req, err := http.NewRequest("GET", qp.url.String(), nil)
if err != nil {
return nil, err
}

req.Header.Set("User-Agent", c.UserAgent)
for _, h := range c.Header {
req.Header.Add(h.K, h.V)
}

return c.client.Do(req)
}
23 changes: 9 additions & 14 deletions crawler/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ func crawlWait(c *Crawler) crawlfn {
// URL to be requested. If we get here, it means we've already decided
// the URL is in the scope of the crawl as defined by the end user.
func crawlCheckRobots(c *Crawler) crawlfn {
addr := c.queue[0]
qp := c.queue[0]
addr := qp.url
rtxtURL, err := robots.Locate(addr.String())
if err != nil {
// FIXME: Couldn't parse URL. Is this the desired behavior?
Expand All @@ -55,7 +56,7 @@ func crawlCheckRobots(c *Crawler) crawlfn {
}
if !c.robots[rtxtURL](addr.String()) {
// FIXME: Can this be some sort of "emit error" func?
result := data.MakeResult(addr.String(), c.depth, nil)
result := data.MakeResult(addr.String(), qp.depth, nil)
result.Status = "Blocked by robots.txt"
c.results <- result
return crawlNext
Expand All @@ -67,7 +68,7 @@ func crawlCheckRobots(c *Crawler) crawlfn {
// determined to try to crawl. The next step is to secure resources to
// actually crawl the URL, and initiate fetching.
func crawlDo(c *Crawler) crawlfn {
addr := c.queue[0]
queuePair := c.queue[0]
// This blocks when there are = c.Connections fetches active.
// Otherwise, it secures a token.
c.connections <- true
Expand All @@ -80,7 +81,7 @@ func crawlDo(c *Crawler) crawlfn {
// ultimately the extraction of the links on the
// crawled page. Merging of newly discovered URLs
// happens as part of this call.
c.fetch(addr)
c.fetch(queuePair)
}()
return crawlNext
}
Expand All @@ -89,10 +90,14 @@ func crawlDo(c *Crawler) crawlfn {
// more URLs in the current queue, we wait for all currently active
// fetches to complete.
func crawlNext(c *Crawler) crawlfn {
// Pop the entry we just dispatched. merge() may be appending to
// c.queue from fetch goroutines concurrently, so the slice
// mutation must be guarded by the same mutex.
c.mu.Lock()
c.queue = c.queue[1:]
c.mu.Unlock()

// NOTE(review): len(c.queue) is read outside the lock below while
// fetch goroutines may still append — presumably benign (a missed
// entry is picked up after crawlAwait), but confirm with -race.
if len(c.queue) > 0 {
return crawlStart
}

return crawlAwait
}

Expand All @@ -101,15 +106,5 @@ func crawlNext(c *Crawler) crawlfn {
// level by level.
func crawlAwait(c *Crawler) crawlfn {
c.wg.Wait()
return crawlNextQueue
}

// crawlNextQueue replace the current queue with the next and starts
// the process again. This next queue represents the accumulated URLs
// in the next level of the crawl that we haven't yet seen.
func crawlNextQueue(c *Crawler) crawlfn {
c.queue = c.nextqueue
c.nextqueue = nil
c.depth++
return crawlStartQueue
}