diff --git a/.github/workflows/lint-test.yml b/.github/workflows/lint-test.yml index e5c32df..e22e212 100644 --- a/.github/workflows/lint-test.yml +++ b/.github/workflows/lint-test.yml @@ -58,9 +58,6 @@ jobs: permissions: contents: read runs-on: ubuntu-24.04 - strategy: - matrix: - traefik: [latest] steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 @@ -68,7 +65,7 @@ jobs: run: go run test.go working-directory: ./ci env: - TRAEFIK_TAG: ${{ matrix.traefik }} + TRAEFIK_TAG: latest - name: cleanup if: ${{ always() }} diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index 82a51fd..bfc2024 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -26,6 +26,7 @@ services: traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "true" healthcheck: test: curl -fs http://localhost/healthz | grep -q OK || exit 1 + start_period: 5s volumes: - ./conf/nginx/default.conf:/etc/nginx/conf.d/default.conf:r networks: @@ -57,6 +58,7 @@ services: traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "true" healthcheck: test: curl -fs http://localhost/healthz | grep -q OK || exit 1 + start_period: 5s volumes: - ./conf/nginx/default.conf:/etc/nginx/conf.d/default.conf:r networks: @@ -71,7 +73,8 @@ services: --api.debug=true --ping=true --entryPoints.http.address=:80 - --entryPoints.http.forwardedHeaders.trustedIPs=10.0.0.0/8,172.16.0.0/12,192.168.0.0/16 + --entryPoints.http.forwardedHeaders.insecure=true + --entryPoints.http.forwardedHeaders.trustedIPs=127.0.0.1/32,::1/128,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16 --providers.docker=true --providers.docker.network=default --experimental.localPlugins.captcha-protect.moduleName=github.com/libops/captcha-protect @@ -90,6 +93,9 @@ services: - traefik healthcheck: test: traefik healthcheck --ping + start_period: 5s depends_on: nginx: condition: service_healthy + nginx2: + condition: service_healthy diff --git a/ci/test.go b/ci/test.go index b810d33..7f25399 100755 --- a/ci/test.go +++ b/ci/test.go @@ -28,9 +28,9 @@ const parallelism = 10 func main() { log := slog.New(slog.NewTextHandler(os.Stdout, nil)) - googleCIDRs, err := helper.FetchGooglebotIPs(log, http.DefaultClient, "https://developers.google.com/static/search/apis/ipranges/googlebot.json") + googleCIDRs, err := helper.FetchGoogleCrawlerIPs(log, http.DefaultClient, helper.GoogleCrawlerIPRangeURLs) if err != nil { - slog.Error("unable to fetch google bot ips", "err", err) + slog.Error("unable to fetch google crawler ips", "err", err) os.Exit(1) } @@ -56,18 +56,18 @@ func main() { runCommand("docker", "compose", "up", "-d") waitForService("http://localhost") waitForService("http://localhost/app2") + waitForGoogleExemptionReady(googleCIDRs) fmt.Printf("Making sure %d attempt(s) pass\n", rateLimit) runParallelChecks(ips, rateLimit, "http://localhost") - - time.Sleep(cp.StateSaveInterval + cp.StateSaveJitter + (1 * time.Second)) - runCommand("jq", ".", "tmp/state.json") + statePath := "./tmp/state.json" + runCommand("jq", ".", statePath) fmt.Printf("Making sure attempt #%d causes a redirect to the challenge page\n", rateLimit+1) ensureRedirect(ips, "http://localhost") fmt.Println("\nTesting state sharing between nginx instances...") - time.Sleep(cp.StateSaveInterval + cp.StateSaveJitter + (1 * time.Second)) + time.Sleep(cp.StateSaveInterval + cp.StateSaveJitter + (5 * time.Second)) testStateSharing(ips) testGoogleBotGetsThrough(googleCIDRs) @@ -81,7 +81,7 @@ func main() { time.Sleep(10 * time.Second) checkStateReload() - runCommand("rm", "tmp/state.json") + runCommand("rm", "-f", statePath) } @@ -147,7 +147,7 @@ func runParallelChecks(ips []string, rateLimit int, url string) { var wg sync.WaitGroup sem := make(chan struct{}, parallelism) - for i := 0; i < rateLimit; i++ { + for range rateLimit { for _, ip := range ips { wg.Add(1) sem <- struct{}{} @@ -305,7 +305,7 @@ func checkStateReload() { os.Exit(1) } - if len(botsMap) != numIPs { + if len(botsMap) < numIPs { slog.Error("Unexpected number of bots", "expected", numIPs, "received", len(botsMap)) os.Exit(1) } @@ -400,7 +400,7 @@ func testGoogleBotGetsThrough(googleCIDRs []string) { // Prime the rate limiter for the GoogleBot IP with parameters fmt.Printf("Priming rate limiter for GoogleBot IP %s with params (%d requests)\n", googleIP, rateLimit) - for i := 0; i < rateLimit; i++ { + for i := range rateLimit { output = httpRequest(googleIP, "http://localhost/?foo=bar") // Assign value if output != "" { slog.Error(fmt.Sprintf("GoogleBot with params was challenged prematurely on request #%d", i+1), "ip", googleIP, "output", output) @@ -421,3 +421,45 @@ func testGoogleBotGetsThrough(googleCIDRs []string) { // set things back to normal for other tests runCommand("docker", "compose", "down") } + +func waitForGoogleExemptionReady(googleCIDRs []string) { + googleIP, err := firstUsableIPv4FromCIDRs(googleCIDRs) + if err != nil { + slog.Warn("Unable to select Google IP for readiness check; skipping warmup", "err", err) + return + } + + deadline := time.Now().Add(90 * time.Second) + for time.Now().Before(deadline) { + ready := true + for i := 0; i < rateLimit+1; i++ { + if output := httpRequest(googleIP, "http://localhost"); output != "" { + ready = false + break + } + } + if ready { + fmt.Printf("Google exemption is active for %s\n", googleIP) + return + } + time.Sleep(500 * time.Millisecond) + } + + slog.Error("Timed out waiting for Google crawler IP exemption to become active", "googleIP", googleIP) + os.Exit(1) +} + +func firstUsableIPv4FromCIDRs(cidrs []string) (string, error) { + for _, cidr := range cidrs { + ip, err := getIPFromCIDR(cidr) + if err != nil { + continue + } + parsed := net.ParseIP(ip) + if parsed != nil && parsed.To4() != nil { + return ip, nil + } + } + + return "", fmt.Errorf("no usable IPv4 found in CIDR list") +} diff --git a/internal/helper/google.go b/internal/helper/google.go index 6412531..ac82d5a 100644 --- a/internal/helper/google.go +++ b/internal/helper/google.go @@ -7,10 +7,19 @@ import ( "log/slog" "net" "net/http" + "net/netip" + "sort" "sync" "time" ) +var GoogleCrawlerIPRangeURLs = []string{ + "https://developers.google.com/static/search/apis/ipranges/googlebot.json", + "https://developers.google.com/static/crawling/ipranges/common-crawlers.json", + "https://developers.google.com/static/crawling/ipranges/special-crawlers.json", + "https://developers.google.com/static/crawling/ipranges/user-triggered-fetchers-google.json", +} + // GooglebotIPs holds the list of Googlebot IP ranges, providing a thread-safe way to check if an IP is a Googlebot. type GooglebotIPs struct { cidrs []*net.IPNet @@ -108,3 +117,89 @@ func FetchGooglebotIPs(log *slog.Logger, httpClient *http.Client, url string) ([ return cidrs, nil } + +// FetchGoogleCrawlerIPs fetches crawler IP ranges from multiple Google-managed endpoints, +// then returns a canonical, unique list where broader prefixes replace narrower prefixes. +func FetchGoogleCrawlerIPs(log *slog.Logger, httpClient *http.Client, urls []string) ([]string, error) { + if len(urls) == 0 { + return nil, nil + } + + allCIDRs := make([]string, 0) + for _, url := range urls { + cidrs, err := FetchGooglebotIPs(log, httpClient, url) + if err != nil { + return nil, err + } + allCIDRs = append(allCIDRs, cidrs...) + } + + return ReduceCIDRs(allCIDRs, log), nil +} + +// RefreshGoogleCrawlerIPs fetches crawler IPs from all configured URLs and updates +// the provided GooglebotIPs set. Returns the number of CIDRs loaded. +func RefreshGoogleCrawlerIPs(log *slog.Logger, httpClient *http.Client, target *GooglebotIPs, urls []string) (int, error) { + cidrs, err := FetchGoogleCrawlerIPs(log, httpClient, urls) + if err != nil { + return 0, err + } + + target.Update(cidrs, log) + + return len(cidrs), nil +} + +// ReduceCIDRs canonicalizes CIDRs, removes exact duplicates, and removes narrower +// ranges when they are fully covered by broader ranges. +func ReduceCIDRs(cidrs []string, log *slog.Logger) []string { + prefixes := make([]netip.Prefix, 0, len(cidrs)) + for _, cidr := range cidrs { + prefix, err := netip.ParsePrefix(cidr) + if err != nil { + if log != nil { + log.Error("error parsing CIDR", "cidr", cidr, "err", err) + } + continue + } + prefixes = append(prefixes, prefix.Masked()) + } + + sort.Slice(prefixes, func(i, j int) bool { + a := prefixes[i] + b := prefixes[j] + + aIs4 := a.Addr().Is4() + bIs4 := b.Addr().Is4() + if aIs4 != bIs4 { + return aIs4 + } + + if a.Bits() != b.Bits() { + return a.Bits() < b.Bits() + } + + return a.Addr().Compare(b.Addr()) < 0 + }) + + reduced := make([]netip.Prefix, 0, len(prefixes)) + for _, candidate := range prefixes { + covered := false + for _, existing := range reduced { + if existing.Bits() <= candidate.Bits() && existing.Contains(candidate.Addr()) { + covered = true + break + } + } + if !covered { + reduced = append(reduced, candidate) + } + } + + result := make([]string, 0, len(reduced)) + for _, prefix := range reduced { + result = append(result, prefix.String()) + } + + return result +} diff --git a/internal/helper/google_test.go b/internal/helper/google_test.go index 2c81185..955d9bc 100644 --- a/internal/helper/google_test.go +++ b/internal/helper/google_test.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httptest" "os" + "reflect" "testing" ) @@ -77,3 +78,117 @@ func TestFetchGooglebotIPs(t *testing.T) { } } } + +func TestReduceCIDRs(t *testing.T) { + log := slog.New(slog.NewTextHandler(os.Stdout, nil)) + + input := []string{ + "8.8.8.0/24", + "8.8.8.0/25", + "8.8.8.128/25", + "8.8.8.0/24", // duplicate + "2001:4860::/32", + "2001:4860:1234::/48", + } + + got := ReduceCIDRs(input, log) + want := []string{"8.8.8.0/24", "2001:4860::/32"} + + if !reflect.DeepEqual(got, want) { + t.Fatalf("unexpected reduced CIDRs: got %v want %v", got, want) + } +} + +func TestFetchGoogleCrawlerIPs(t *testing.T) { + log := slog.New(slog.NewTextHandler(os.Stdout, nil)) + + serverA := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"8.8.8.0/24"}]}`)) + })) + defer serverA.Close() + + serverB := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"8.8.8.0/25"},{"ipv6Prefix":"2001:4860::/32"}]}`)) + })) + defer serverB.Close() + + got, err := FetchGoogleCrawlerIPs(log, serverA.Client(), []string{serverA.URL, serverB.URL}) + if err != nil { + t.Fatalf("FetchGoogleCrawlerIPs failed: %v", err) + } + + want := []string{"8.8.8.0/24", "2001:4860::/32"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("unexpected CIDRs: got %v want %v", got, want) + } +} + +func TestFetchGoogleCrawlerIPsError(t *testing.T) { + log := slog.New(slog.NewTextHandler(os.Stdout, nil)) + + okServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"8.8.8.0/24"}]}`)) + })) + defer okServer.Close() + + errServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + })) + defer errServer.Close() + + _, err := FetchGoogleCrawlerIPs(log, okServer.Client(), []string{okServer.URL, errServer.URL}) + if err == nil { + t.Fatal("expected error when one endpoint returns non-200") + } +} + +func TestRefreshGoogleCrawlerIPs(t *testing.T) { + log := slog.New(slog.NewTextHandler(os.Stdout, nil)) + g := NewGooglebotIPs() + + serverA := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"203.0.113.0/24"}]}`)) + })) + defer serverA.Close() + + serverB := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"203.0.113.0/25"},{"ipv6Prefix":"2001:db8::/32"}]}`)) + })) + defer serverB.Close() + + count, err := RefreshGoogleCrawlerIPs(log, serverA.Client(), g, []string{serverA.URL, serverB.URL}) + if err != nil { + t.Fatalf("RefreshGoogleCrawlerIPs failed: %v", err) + } + + if count != 2 { + t.Fatalf("expected reduced count 2, got %d", count) + } + + if !g.Contains(net.ParseIP("203.0.113.9")) { + t.Fatal("expected refreshed set to contain 203.0.113.9") + } + if !g.Contains(net.ParseIP("2001:db8::1")) { + t.Fatal("expected refreshed set to contain 2001:db8::1") + } +} + +func TestRefreshGoogleCrawlerIPsError(t *testing.T) { + log := slog.New(slog.NewTextHandler(os.Stdout, nil)) + g := NewGooglebotIPs() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + })) + defer server.Close() + + _, err := RefreshGoogleCrawlerIPs(log, server.Client(), g, []string{server.URL}) + if err == nil { + t.Fatal("expected refresh error") + } +} diff --git a/main.go b/main.go index b586ff4..4ba76be 100644 --- a/main.go +++ b/main.go @@ -351,24 +351,22 @@ func (bc *CaptchaProtect) googlebotIPCheckLoop(ctx context.Context) { defer ticker.Stop() // Initial fetch - cidrs, err := helper.FetchGooglebotIPs(bc.log, bc.httpClient, "https://developers.google.com/static/search/apis/ipranges/googlebot.json") + count, err := helper.RefreshGoogleCrawlerIPs(bc.log, bc.httpClient, bc.googlebotIPs, helper.GoogleCrawlerIPRangeURLs) if err != nil { bc.log.Error("failed to fetch googlebot ips", "err", err) } else { - bc.googlebotIPs.Update(cidrs, bc.log) - bc.log.Info("Updated Googlebot IPs", "count", len(cidrs)) + bc.log.Info("Updated Googlebot IPs", "count", count) } for { select { case <-ticker.C: - cidrs, err := helper.FetchGooglebotIPs(bc.log, bc.httpClient, "https://developers.google.com/static/search/apis/ipranges/googlebot.json") + count, err := helper.RefreshGoogleCrawlerIPs(bc.log, bc.httpClient, bc.googlebotIPs, helper.GoogleCrawlerIPRangeURLs) if err != nil { bc.log.Error("failed to fetch googlebot ips", "err", err) continue } - bc.googlebotIPs.Update(cidrs, bc.log) - bc.log.Info("Updated Googlebot IPs", "count", len(cidrs)) + bc.log.Info("Updated Googlebot IPs", "count", count) case <-ctx.Done(): return } diff --git a/main_test.go b/main_test.go index 9ac2214..7aae468 100644 --- a/main_test.go +++ b/main_test.go @@ -14,6 +14,8 @@ import ( "strings" "testing" "time" + + "github.com/libops/captcha-protect/internal/helper" ) func TestParseIp(t *testing.T) { @@ -1629,3 +1631,221 @@ func TestPojChallengeGeneration(t *testing.T) { t.Errorf("Expected PoJ JS URL in challenge page") } } + +func TestPerformHealthCheckSuccessResetsFailures(t *testing.T) { + config := CreateConfig() + config.SiteKey = "test" + config.SecretKey = "test" + config.ProtectRoutes = []string{"/"} + config.CaptchaProvider = "turnstile" + config.PeriodSeconds = 3600 + config.FailureThreshold = 2 + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + bc, err := NewCaptchaProtect(ctx, nil, config, "test") + if err != nil { + t.Fatalf("Failed to create CaptchaProtect: %v", err) + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer server.Close() + + bc.captchaConfig.js = server.URL + bc.recordHealthCheckFailure() + + bc.performHealthCheck() + + bc.mu.RLock() + defer bc.mu.RUnlock() + if bc.healthCheckFailureCount != 0 { + t.Fatalf("expected failure count reset to 0, got %d", bc.healthCheckFailureCount) + } + if bc.circuitState != circuitClosed { + t.Fatalf("expected circuit to be closed, got %v", bc.circuitState) + } +} + +func TestPerformHealthCheckFailurePaths(t *testing.T) { + tests := []struct { + name string + jsURL string + status int + expectErr bool + }{ + { + name: "404 considered failure", + status: http.StatusNotFound, + }, + { + name: "503 considered failure", + status: http.StatusServiceUnavailable, + }, + { + name: "invalid URL request creation failure", + jsURL: "://invalid-url", + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + config := CreateConfig() + config.SiteKey = "test" + config.SecretKey = "test" + config.ProtectRoutes = []string{"/"} + config.CaptchaProvider = "turnstile" + config.PeriodSeconds = 3600 + config.FailureThreshold = 1 + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + bc, err := NewCaptchaProtect(ctx, nil, config, "test") + if err != nil { + t.Fatalf("Failed to create CaptchaProtect: %v", err) + } + + if tt.expectErr { + bc.captchaConfig.js = tt.jsURL + } else { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(tt.status) + })) + defer server.Close() + bc.captchaConfig.js = server.URL + } + + bc.performHealthCheck() + + bc.mu.RLock() + defer bc.mu.RUnlock() + if bc.healthCheckFailureCount != 1 { + t.Fatalf("expected failure count 1, got %d", bc.healthCheckFailureCount) + } + if bc.circuitState != circuitOpen { + t.Fatalf("expected circuit to open, got %v", bc.circuitState) + } + }) + } +} + +func TestVerifyChallengePagePojFallbackUsesOneHourTTL(t *testing.T) { + config := CreateConfig() + config.SiteKey = "test-key" + config.SecretKey = "test-secret" + config.ProtectRoutes = []string{"/"} + config.CaptchaProvider = "turnstile" + config.PeriodSeconds = 3600 + config.FailureThreshold = 1 + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + bc, err := NewCaptchaProtect(ctx, nil, config, "test") + if err != nil { + t.Fatalf("Failed to create CaptchaProtect: %v", err) + } + + // Open the circuit so PoJ becomes active fallback provider. + bc.recordHealthCheckFailure() + + form := url.Values{} + form.Add("poj-captcha-response", "ok") + form.Add("destination", "%2F") + req := httptest.NewRequest(http.MethodPost, "/challenge", strings.NewReader(form.Encode())) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + rr := httptest.NewRecorder() + clientIP := "203.0.113.10" + + status := bc.verifyChallengePage(rr, req, clientIP) + if status != http.StatusFound { + t.Fatalf("expected status %d, got %d", http.StatusFound, status) + } + + item, found := bc.verifiedCache.Items()[clientIP] + if !found { + t.Fatalf("expected %s to be in verified cache", clientIP) + } + + remaining := time.Until(time.Unix(0, item.Expiration)) + if remaining < 50*time.Minute || remaining > 70*time.Minute { + t.Fatalf("expected PoJ fallback TTL around 1h, got %s", remaining) + } +} + +func TestGooglebotIPCheckLoopInitialFetchSuccess(t *testing.T) { + originalURLs := helper.GoogleCrawlerIPRangeURLs + defer func() { helper.GoogleCrawlerIPRangeURLs = originalURLs }() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"prefixes":[{"ipv4Prefix":"203.0.113.0/24"}]}`)) + })) + defer server.Close() + + helper.GoogleCrawlerIPRangeURLs = []string{server.URL} + + bc := &CaptchaProtect{ + log: slog.New(slog.NewTextHandler(os.Stdout, nil)), + httpClient: server.Client(), + googlebotIPs: helper.NewGooglebotIPs(), + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + bc.googlebotIPCheckLoop(ctx) + close(done) + }() + + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if bc.googlebotIPs.Contains(net.ParseIP("203.0.113.10")) { + cancel() + <-done + return + } + time.Sleep(20 * time.Millisecond) + } + + cancel() + <-done + t.Fatal("expected googlebot IPs to be updated from initial crawler fetch") +} + +func TestGooglebotIPCheckLoopInitialFetchError(t *testing.T) { + originalURLs := helper.GoogleCrawlerIPRangeURLs + defer func() { helper.GoogleCrawlerIPRangeURLs = originalURLs }() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + })) + defer server.Close() + + helper.GoogleCrawlerIPRangeURLs = []string{server.URL} + + bc := &CaptchaProtect{ + log: slog.New(slog.NewTextHandler(os.Stdout, nil)), + httpClient: server.Client(), + googlebotIPs: helper.NewGooglebotIPs(), + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + bc.googlebotIPCheckLoop(ctx) + close(done) + }() + + time.Sleep(100 * time.Millisecond) + cancel() + <-done + + if bc.googlebotIPs.Contains(net.ParseIP("203.0.113.10")) { + t.Fatal("did not expect googlebot IPs to update when initial fetch fails") + } +}