From 3c15fa9d7f5e68387c5a22f3ed7c7289d9f23e90 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 16:52:16 +0000 Subject: [PATCH 01/10] Initial plan From 5c2f321f3c2230ffbf88d7fb258f76811cabe3f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 16:57:09 +0000 Subject: [PATCH 02/10] Implement Google Scholar rate limiting with configurable limits Co-authored-by: boonebgorges <246627+boonebgorges@users.noreply.github.com> --- Core/Models/Feed.php | 13 +- Core/Utility/GoogleScholarRateLimiter.php | 267 ++++++++++++++++++++++ Core/Utility/Retrieval.php | 27 +++ modules/google-scholar/google-scholar.php | 30 ++- 4 files changed, 333 insertions(+), 4 deletions(-) create mode 100644 Core/Utility/GoogleScholarRateLimiter.php diff --git a/Core/Models/Feed.php b/Core/Models/Feed.php index 00454dcd..fc93706c 100644 --- a/Core/Models/Feed.php +++ b/Core/Models/Feed.php @@ -206,10 +206,19 @@ public function get_next_scheduled_retrieval_string() { * @return true|\WP_Error True if scheduled, WP_Error if not. See wp_schedule_event(). */ public function schedule_retrieval( $args = [] ) { + // Determine if this is a Google Scholar feed. + $feeds_schema = pressforward( 'schema.feeds' ); + $feed_type = $feeds_schema->get_pf_feed_type( $this->get( 'id' ) ); + $is_google_scholar = in_array( $feed_type, [ 'google-scholar', 'google-scholar-keyword', 'google-scholar-author' ], true ); + + // Set default interval and nextrun based on feed type. + $default_interval = $is_google_scholar ? 'pf_google_scholar_interval' : 'pf_interval'; + $default_nextrun = $is_google_scholar ? time() + ( wp_rand( 0, 120 ) * MINUTE_IN_SECONDS ) : time() + ( wp_rand( 0, 30 ) * MINUTE_IN_SECONDS ); + $r = array_merge( [ - 'interval' => 'pf_interval', - 'nextrun' => time() + ( wp_rand( 0, 30 ) * MINUTE_IN_SECONDS ), + 'interval' => $default_interval, + 'nextrun' => $default_nextrun, ], $args ); diff --git a/Core/Utility/GoogleScholarRateLimiter.php b/Core/Utility/GoogleScholarRateLimiter.php new file mode 100644 index 00000000..0cfb4553 --- /dev/null +++ b/Core/Utility/GoogleScholarRateLimiter.php @@ -0,0 +1,267 @@ + $cutoff_time; + } + ); + } + + /** + * Check if a request is allowed based on rate limits. + * + * @return bool|array True if allowed, array with error details if not. + */ + public static function is_request_allowed() { + $timestamps = self::get_request_timestamps(); + $timestamps = self::cleanup_old_timestamps( $timestamps ); + + $now = time(); + $one_hour_ago = $now - HOUR_IN_SECONDS; + $one_day_ago = $now - DAY_IN_SECONDS; + $max_per_hour = self::get_max_requests_per_hour(); + $max_per_day = self::get_max_requests_per_day(); + + // Count requests in the last hour. + $requests_last_hour = count( + array_filter( + $timestamps, + function ( $timestamp ) use ( $one_hour_ago ) { + return $timestamp > $one_hour_ago; + } + ) + ); + + // Count requests in the last day. + $requests_last_day = count( + array_filter( + $timestamps, + function ( $timestamp ) use ( $one_day_ago ) { + return $timestamp > $one_day_ago; + } + ) + ); + + // Check hourly limit. + if ( $requests_last_hour >= $max_per_hour ) { + return [ + 'allowed' => false, + 'reason' => 'hourly_limit', + 'message' => sprintf( + // translators: 1: number of requests, 2: time period. + __( 'Google Scholar request limit reached: %1$d requests per %2$s. Please try again later.', 'pressforward' ), + $max_per_hour, + __( 'hour', 'pressforward' ) + ), + 'requests_count' => $requests_last_hour, + 'limit' => $max_per_hour, + 'retry_after' => self::get_retry_after_seconds( $timestamps, HOUR_IN_SECONDS, $max_per_hour ), + ]; + } + + // Check daily limit. + if ( $requests_last_day >= $max_per_day ) { + return [ + 'allowed' => false, + 'reason' => 'daily_limit', + 'message' => sprintf( + // translators: 1: number of requests, 2: time period. + __( 'Google Scholar request limit reached: %1$d requests per %2$s. Please try again later.', 'pressforward' ), + $max_per_day, + __( 'day', 'pressforward' ) + ), + 'requests_count' => $requests_last_day, + 'limit' => $max_per_day, + 'retry_after' => self::get_retry_after_seconds( $timestamps, DAY_IN_SECONDS, $max_per_day ), + ]; + } + + return true; + } + + /** + * Get the number of seconds until the next request can be made. + * + * @param array $timestamps Array of Unix timestamps. + * @param int $period Time period in seconds. + * @param int $limit Maximum requests in the period. + * @return int Seconds until retry is allowed. + */ + protected static function get_retry_after_seconds( $timestamps, $period, $limit ) { + $now = time(); + $cutoff_time = $now - $period; + + $recent_timestamps = array_filter( + $timestamps, + function ( $timestamp ) use ( $cutoff_time ) { + return $timestamp > $cutoff_time; + } + ); + + if ( count( $recent_timestamps ) < $limit ) { + return 0; + } + + // Sort timestamps in ascending order. + sort( $recent_timestamps ); + + // The oldest timestamp in the period. + $oldest_timestamp = reset( $recent_timestamps ); + + // Calculate when that timestamp will fall outside the period. + $retry_after = ( $oldest_timestamp + $period ) - $now; + + return max( 0, $retry_after ); + } + + /** + * Record a request timestamp. + * + * @return bool True on success, false on failure. + */ + public static function record_request() { + $timestamps = self::get_request_timestamps(); + $timestamps = self::cleanup_old_timestamps( $timestamps ); + $timestamps[] = time(); + + return self::save_request_timestamps( $timestamps ); + } + + /** + * Get current rate limit status information. + * + * @return array Status information. + */ + public static function get_status() { + $timestamps = self::get_request_timestamps(); + $timestamps = self::cleanup_old_timestamps( $timestamps ); + + $now = time(); + $one_hour_ago = $now - HOUR_IN_SECONDS; + $one_day_ago = $now - DAY_IN_SECONDS; + $max_per_hour = self::get_max_requests_per_hour(); + $max_per_day = self::get_max_requests_per_day(); + + $requests_last_hour = count( + array_filter( + $timestamps, + function ( $timestamp ) use ( $one_hour_ago ) { + return $timestamp > $one_hour_ago; + } + ) + ); + + $requests_last_day = count( + array_filter( + $timestamps, + function ( $timestamp ) use ( $one_day_ago ) { + return $timestamp > $one_day_ago; + } + ) + ); + + return [ + 'requests_last_hour' => $requests_last_hour, + 'max_requests_per_hour' => $max_per_hour, + 'requests_last_day' => $requests_last_day, + 'max_requests_per_day' => $max_per_day, + 'hourly_remaining' => max( 0, $max_per_hour - $requests_last_hour ), + 'daily_remaining' => max( 0, $max_per_day - $requests_last_day ), + ]; + } + + /** + * Reset all stored request timestamps. + * + * @return bool True on success, false on failure. + */ + public static function reset() { + return delete_option( self::OPTION_NAME ); + } +} diff --git a/Core/Utility/Retrieval.php b/Core/Utility/Retrieval.php index 9f89bcaa..e08707f4 100644 --- a/Core/Utility/Retrieval.php +++ b/Core/Utility/Retrieval.php @@ -8,6 +8,7 @@ namespace PressForward\Core\Utility; use PressForward\Core\Models\Feed; +use PressForward\Core\Utility\GoogleScholarRateLimiter; /** * Feed 'slurping' class. @@ -57,6 +58,13 @@ public function cron_add_short( $schedules ) { 'interval' => $pf_interval * 60, 'display' => __( 'PressForward Retrieval Interval', 'pressforward' ), ); + + // Add daily schedule for Google Scholar feeds. + $schedules['pf_google_scholar_interval'] = array( + 'interval' => DAY_IN_SECONDS, + 'display' => __( 'PressForward Google Scholar Retrieval Interval (Daily)', 'pressforward' ), + ); + return $schedules; } @@ -198,6 +206,25 @@ public function ajax_update_feed_handler() { return; } + // Check if this is a Google Scholar feed and if rate limited. + $feeds_schema = pressforward( 'schema.feeds' ); + $feed_type = $feeds_schema->get_pf_feed_type( $post_id ); + $is_google_scholar = in_array( $feed_type, [ 'google-scholar', 'google-scholar-keyword', 'google-scholar-author' ], true ); + + if ( $is_google_scholar ) { + $rate_limit_check = GoogleScholarRateLimiter::is_request_allowed(); + if ( true !== $rate_limit_check ) { + pf_log( 'Manual Google Scholar refresh rate limit reached: ' . $rate_limit_check['message'] ); + wp_send_json_error( + [ + 'message' => $rate_limit_check['message'], + 'retry_after' => $rate_limit_check['retry_after'], + ] + ); + return; + } + } + $retrieved_status = $feed->retrieve(); $retval = [ diff --git a/modules/google-scholar/google-scholar.php b/modules/google-scholar/google-scholar.php index 229fac88..ddc777b0 100644 --- a/modules/google-scholar/google-scholar.php +++ b/modules/google-scholar/google-scholar.php @@ -8,6 +8,7 @@ use PressForward\Interfaces\FeedSource; use PressForward\Core\DTO\FeedItem; +use PressForward\Core\Utility\GoogleScholarRateLimiter; /** * PF_Google_Scholar class. @@ -28,6 +29,17 @@ public function __construct() { * @return array|\WP_Error */ public function fetch( $feed ) { + // Check rate limit before making request. + $rate_limit_check = GoogleScholarRateLimiter::is_request_allowed(); + if ( true !== $rate_limit_check ) { + pf_log( 'Google Scholar rate limit reached: ' . $rate_limit_check['message'] ); + return [ + 'success' => false, + 'message' => $rate_limit_check['message'], + 'entries' => [], + ]; + } + $url = $feed->get( 'remote_feed_url' ); $is_profile = false !== strpos( $url, 'user=' ); @@ -42,7 +54,7 @@ public function fetch( $feed ) { $url, [ 'timeout' => 30, - 'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36', + 'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', ] ); @@ -80,6 +92,9 @@ public function fetch( $feed ) { $entries = $this->parse_items_from_search( $xpath, $feed ); } + // Record successful request. + GoogleScholarRateLimiter::record_request(); + return $entries; } @@ -263,6 +278,14 @@ protected function parse_items_from_search( $xpath, $feed ) { * @param bool $is_new_feed Whether the feed is new. */ public function health_check( \PressForward\Core\Models\Feed $feed, $is_new_feed = false ) { + // Check rate limit before making request. + $rate_limit_check = GoogleScholarRateLimiter::is_request_allowed(); + if ( true !== $rate_limit_check ) { + pf_log( 'Google Scholar health check rate limit reached: ' . $rate_limit_check['message'] ); + // For health checks, we'll skip rather than fail when rate limited. + return; + } + $feed_url = $feed->get( 'remote_feed_url' ); $feed_is_valid = false; @@ -272,7 +295,7 @@ public function health_check( \PressForward\Core\Models\Feed $feed, $is_new_feed $feed_url, [ 'timeout' => 30, - 'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36', + 'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', ] ); @@ -302,6 +325,9 @@ public function health_check( \PressForward\Core\Models\Feed $feed, $is_new_feed $alert_box->dismiss_alert( $feed->get( 'id' ) ); } + // Record successful request. + GoogleScholarRateLimiter::record_request(); + if ( $is_new_feed ) { // Get the feed title from search box HTML, class 'gs_in_txt'. $doc = new DOMDocument(); From c3b68508fbce0e5557e0edd0b58b9a947a3e5b67 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 17:01:27 +0000 Subject: [PATCH 03/10] Add tests for Google Scholar rate limiter Co-authored-by: boonebgorges <246627+boonebgorges@users.noreply.github.com> --- tests/test-google-scholar-rate-limiter.php | 141 +++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 tests/test-google-scholar-rate-limiter.php diff --git a/tests/test-google-scholar-rate-limiter.php b/tests/test-google-scholar-rate-limiter.php new file mode 100644 index 00000000..a3d603d7 --- /dev/null +++ b/tests/test-google-scholar-rate-limiter.php @@ -0,0 +1,141 @@ +assertTrue( $result ); + } + + /** + * Test that requests are blocked after hourly limit is reached. + */ + public function test_hourly_limit_is_enforced() { + $max_per_hour = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_hour(); + + // Make max requests. + for ( $i = 0; $i < $max_per_hour; $i++ ) { + $this->assertTrue( \PressForward\Core\Utility\GoogleScholarRateLimiter::is_request_allowed() ); + \PressForward\Core\Utility\GoogleScholarRateLimiter::record_request(); + } + + // Next request should be blocked. + $result = \PressForward\Core\Utility\GoogleScholarRateLimiter::is_request_allowed(); + $this->assertIsArray( $result ); + $this->assertFalse( $result['allowed'] ); + $this->assertEquals( 'hourly_limit', $result['reason'] ); + } + + /** + * Test that requests are blocked after daily limit is reached. + */ + public function test_daily_limit_is_enforced() { + // Set a low hourly limit to test daily limit. + update_option( 'pf_google_scholar_max_per_hour', 100 ); + $max_per_day = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_day(); + + // Make max requests. + for ( $i = 0; $i < $max_per_day; $i++ ) { + \PressForward\Core\Utility\GoogleScholarRateLimiter::record_request(); + } + + // Next request should be blocked by daily limit. + $result = \PressForward\Core\Utility\GoogleScholarRateLimiter::is_request_allowed(); + $this->assertIsArray( $result ); + $this->assertFalse( $result['allowed'] ); + $this->assertEquals( 'daily_limit', $result['reason'] ); + } + + /** + * Test that get_status returns correct information. + */ + public function test_get_status_returns_correct_info() { + // Make some requests. + \PressForward\Core\Utility\GoogleScholarRateLimiter::record_request(); + \PressForward\Core\Utility\GoogleScholarRateLimiter::record_request(); + + $status = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_status(); + + $this->assertIsArray( $status ); + $this->assertEquals( 2, $status['requests_last_hour'] ); + $this->assertEquals( 2, $status['requests_last_day'] ); + $this->assertGreaterThan( 0, $status['max_requests_per_hour'] ); + $this->assertGreaterThan( 0, $status['max_requests_per_day'] ); + } + + /** + * Test that reset clears all timestamps. + */ + public function test_reset_clears_timestamps() { + // Make some requests. + \PressForward\Core\Utility\GoogleScholarRateLimiter::record_request(); + \PressForward\Core\Utility\GoogleScholarRateLimiter::record_request(); + + $status = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_status(); + $this->assertEquals( 2, $status['requests_last_hour'] ); + + // Reset. + \PressForward\Core\Utility\GoogleScholarRateLimiter::reset(); + + $status = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_status(); + $this->assertEquals( 0, $status['requests_last_hour'] ); + $this->assertEquals( 0, $status['requests_last_day'] ); + } + + /** + * Test that custom limits can be configured. + */ + public function test_custom_limits_can_be_configured() { + update_option( 'pf_google_scholar_max_per_hour', 5 ); + update_option( 'pf_google_scholar_max_per_day', 20 ); + + $this->assertEquals( 5, \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_hour() ); + $this->assertEquals( 20, \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_day() ); + } + + /** + * Test that retry_after is calculated correctly. + */ + public function test_retry_after_is_calculated() { + $max_per_hour = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_hour(); + + // Make max requests. + for ( $i = 0; $i < $max_per_hour; $i++ ) { + \PressForward\Core\Utility\GoogleScholarRateLimiter::record_request(); + } + + // Next request should be blocked with retry_after value. + $result = \PressForward\Core\Utility\GoogleScholarRateLimiter::is_request_allowed(); + $this->assertIsArray( $result ); + $this->assertArrayHasKey( 'retry_after', $result ); + $this->assertGreaterThan( 0, $result['retry_after'] ); + $this->assertLessThanOrEqual( HOUR_IN_SECONDS, $result['retry_after'] ); + } +} From 2dddb0a91e04c2c37ac27517c2b824be80549e44 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 17:16:33 +0000 Subject: [PATCH 04/10] Add Google rate-limit detection in validate_feed and fetch methods Co-authored-by: boonebgorges <246627+boonebgorges@users.noreply.github.com> --- Core/Schema/Feeds.php | 57 +++++++++++++++++++++- modules/google-scholar/google-scholar.php | 59 +++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) diff --git a/Core/Schema/Feeds.php b/Core/Schema/Feeds.php index 5508a26b..ddbe6687 100644 --- a/Core/Schema/Feeds.php +++ b/Core/Schema/Feeds.php @@ -12,6 +12,7 @@ use Intraxia\Jaxion\Contract\Core\HasFilters; use PressForward\Controllers\Metas; use PressForward\Core\Models\Feed; +use PressForward\Core\Utility\GoogleScholarRateLimiter; /** * Database class for manipulating feed. @@ -817,12 +818,31 @@ public static function validate_feed( $url ) { switch ( $feed_type ) { case 'google-scholar-author': case 'google-scholar-keyword': - $request = wp_remote_get( $url ); + // Check rate limit before making request. + $rate_limit_check = GoogleScholarRateLimiter::is_request_allowed(); + if ( true !== $rate_limit_check ) { + $retval['message'] = $rate_limit_check['message']; + break; + } + + $request = wp_remote_get( + $url, + [ + 'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + ] + ); + if ( is_wp_error( $request ) ) { $retval['message'] = $request->get_error_message(); } elseif ( 200 !== wp_remote_retrieve_response_code( $request ) ) { $retval['message'] = __( 'The URL returned an error.', 'pressforward' ); + } elseif ( self::is_google_rate_limited_response( $request ) ) { + // Detected Google's rate limiting page. + $retval['message'] = __( 'Google Scholar is rate limiting requests from your server. Please try again later.', 'pressforward' ); } else { + // Record successful request only if not rate limited. + GoogleScholarRateLimiter::record_request(); + $retval['success'] = true; $retval['feedUrl'] = $url; $retval['message'] = 'google-scholar-author' === $feed_type ? __( 'Google Scholar author feed detected.', 'pressforward' ) : __( 'Google Scholar keyword feed detected.', 'pressforward' ); @@ -843,6 +863,41 @@ public static function validate_feed( $url ) { return $retval; } + /** + * Detects if a response indicates Google rate limiting. + * + * Google redirects to a CAPTCHA page at /sorry/index when rate limiting. + * This can be a 302 redirect or a 200 response with the sorry page content. + * + * @param array|\WP_Error $response HTTP response from wp_remote_get(). + * @return bool True if rate limiting is detected, false otherwise. + */ + protected static function is_google_rate_limited_response( $response ) { + if ( is_wp_error( $response ) ) { + return false; + } + + // Check for redirect to sorry page. + $redirect_url = wp_remote_retrieve_header( $response, 'location' ); + if ( $redirect_url && false !== strpos( $redirect_url, '/sorry/' ) ) { + return true; + } + + // Check the body for signs of the sorry page. + $body = wp_remote_retrieve_body( $response ); + if ( $body ) { + // Check for common indicators of Google's rate limiting page. + if ( false !== strpos( $body, '/sorry/' ) || + false !== strpos( $body, 'automated queries' ) || + false !== strpos( $body, 'unusual traffic' ) || + false !== strpos( $body, 'recaptcha' ) ) { + return true; + } + } + + return false; + } + /** * Detects a feed type based on the URL format. * diff --git a/modules/google-scholar/google-scholar.php b/modules/google-scholar/google-scholar.php index ddc777b0..da42439a 100644 --- a/modules/google-scholar/google-scholar.php +++ b/modules/google-scholar/google-scholar.php @@ -22,6 +22,47 @@ public function __construct() { parent::start(); } + /** + * Detects if a response indicates Google rate limiting. + * + * Google redirects to a CAPTCHA page at /sorry/index when rate limiting. + * This can be a 302 redirect or a 200 response with the sorry page content. + * + * @param array|WP_Error $response HTTP response from wp_remote_get(). + * @return bool True if rate limiting is detected, false otherwise. + */ + protected function is_rate_limited_response( $response ) { + if ( is_wp_error( $response ) ) { + return false; + } + + // Check for redirect to sorry page. + $redirect_url = wp_remote_retrieve_header( $response, 'location' ); + if ( $redirect_url && false !== strpos( $redirect_url, '/sorry/' ) ) { + return true; + } + + // Check the final URL after redirects. + $final_url = wp_remote_retrieve_header( $response, 'x-final-url' ); + if ( ! $final_url ) { + // If no x-final-url header, check the body for signs of the sorry page. + $body = wp_remote_retrieve_body( $response ); + if ( $body ) { + // Check for common indicators of Google's rate limiting page. + if ( false !== strpos( $body, '/sorry/' ) || + false !== strpos( $body, 'automated queries' ) || + false !== strpos( $body, 'unusual traffic' ) || + false !== strpos( $body, 'recaptcha' ) ) { + return true; + } + } + } elseif ( false !== strpos( $final_url, '/sorry/' ) ) { + return true; + } + + return false; + } + /** * Fetches data from URL. * @@ -67,6 +108,16 @@ public function fetch( $feed ) { ]; } + // Check if the response indicates rate limiting by Google. + if ( $this->is_rate_limited_response( $response ) ) { + pf_log( 'Google Scholar response indicates rate limiting (CAPTCHA/sorry page)' ); + return [ + 'success' => false, + 'message' => __( 'Google Scholar is rate limiting this request. Please try again later.', 'pressforward' ), + 'entries' => [], + ]; + } + // Get the body content. $html = wp_remote_retrieve_body( $response ); @@ -301,6 +352,14 @@ public function health_check( \PressForward\Core\Models\Feed $feed, $is_new_feed $body = ''; if ( ! is_wp_error( $response ) ) { + // Check if the response indicates rate limiting by Google. + if ( $this->is_rate_limited_response( $response ) ) { + pf_log( 'Google Scholar health check response indicates rate limiting (CAPTCHA/sorry page)' ); + // Don't record this as a successful request. + // Don't mark the feed as invalid, just skip health check. + return; + } + $body = wp_remote_retrieve_body( $response ); // Check if the body contains Google Scholar specific content. From 95a8b18e57b2bf402abab0161761bb2f0327ccd2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 17:21:38 +0000 Subject: [PATCH 05/10] Refactor rate-limit detection: extract USER_AGENT constant, make method static, use case-insensitive checks Co-authored-by: boonebgorges <246627+boonebgorges@users.noreply.github.com> --- Core/Schema/Feeds.php | 39 +------------------- modules/google-scholar/google-scholar.php | 45 ++++++++++++----------- 2 files changed, 25 insertions(+), 59 deletions(-) diff --git a/Core/Schema/Feeds.php b/Core/Schema/Feeds.php index ddbe6687..fb20c272 100644 --- a/Core/Schema/Feeds.php +++ b/Core/Schema/Feeds.php @@ -828,7 +828,7 @@ public static function validate_feed( $url ) { $request = wp_remote_get( $url, [ - 'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'user-agent' => PF_Google_Scholar::USER_AGENT, ] ); @@ -836,7 +836,7 @@ public static function validate_feed( $url ) { $retval['message'] = $request->get_error_message(); } elseif ( 200 !== wp_remote_retrieve_response_code( $request ) ) { $retval['message'] = __( 'The URL returned an error.', 'pressforward' ); - } elseif ( self::is_google_rate_limited_response( $request ) ) { + } elseif ( PF_Google_Scholar::is_rate_limited_response( $request ) ) { // Detected Google's rate limiting page. $retval['message'] = __( 'Google Scholar is rate limiting requests from your server. Please try again later.', 'pressforward' ); } else { @@ -863,41 +863,6 @@ public static function validate_feed( $url ) { return $retval; } - /** - * Detects if a response indicates Google rate limiting. - * - * Google redirects to a CAPTCHA page at /sorry/index when rate limiting. - * This can be a 302 redirect or a 200 response with the sorry page content. - * - * @param array|\WP_Error $response HTTP response from wp_remote_get(). - * @return bool True if rate limiting is detected, false otherwise. - */ - protected static function is_google_rate_limited_response( $response ) { - if ( is_wp_error( $response ) ) { - return false; - } - - // Check for redirect to sorry page. - $redirect_url = wp_remote_retrieve_header( $response, 'location' ); - if ( $redirect_url && false !== strpos( $redirect_url, '/sorry/' ) ) { - return true; - } - - // Check the body for signs of the sorry page. - $body = wp_remote_retrieve_body( $response ); - if ( $body ) { - // Check for common indicators of Google's rate limiting page. - if ( false !== strpos( $body, '/sorry/' ) || - false !== strpos( $body, 'automated queries' ) || - false !== strpos( $body, 'unusual traffic' ) || - false !== strpos( $body, 'recaptcha' ) ) { - return true; - } - } - - return false; - } - /** * Detects a feed type based on the URL format. * diff --git a/modules/google-scholar/google-scholar.php b/modules/google-scholar/google-scholar.php index da42439a..6b414fb4 100644 --- a/modules/google-scholar/google-scholar.php +++ b/modules/google-scholar/google-scholar.php @@ -14,6 +14,13 @@ * PF_Google_Scholar class. */ class PF_Google_Scholar extends PF_Module implements FeedSource { + /** + * User-Agent string to use for Google Scholar requests. + * + * @var string + */ + const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'; + /** * Constructor. */ @@ -28,36 +35,30 @@ public function __construct() { * Google redirects to a CAPTCHA page at /sorry/index when rate limiting. * This can be a 302 redirect or a 200 response with the sorry page content. * - * @param array|WP_Error $response HTTP response from wp_remote_get(). + * @param array|\WP_Error $response HTTP response from wp_remote_get(). * @return bool True if rate limiting is detected, false otherwise. */ - protected function is_rate_limited_response( $response ) { + public static function is_rate_limited_response( $response ) { if ( is_wp_error( $response ) ) { return false; } // Check for redirect to sorry page. $redirect_url = wp_remote_retrieve_header( $response, 'location' ); - if ( $redirect_url && false !== strpos( $redirect_url, '/sorry/' ) ) { + if ( $redirect_url && false !== stripos( $redirect_url, '/sorry/' ) ) { return true; } - // Check the final URL after redirects. - $final_url = wp_remote_retrieve_header( $response, 'x-final-url' ); - if ( ! $final_url ) { - // If no x-final-url header, check the body for signs of the sorry page. - $body = wp_remote_retrieve_body( $response ); - if ( $body ) { - // Check for common indicators of Google's rate limiting page. - if ( false !== strpos( $body, '/sorry/' ) || - false !== strpos( $body, 'automated queries' ) || - false !== strpos( $body, 'unusual traffic' ) || - false !== strpos( $body, 'recaptcha' ) ) { - return true; - } + // Check the body for signs of the sorry page (case-insensitive). + $body = wp_remote_retrieve_body( $response ); + if ( $body ) { + // Check for common indicators of Google's rate limiting page. + if ( false !== stripos( $body, '/sorry/' ) || + false !== stripos( $body, 'automated queries' ) || + false !== stripos( $body, 'unusual traffic' ) || + false !== stripos( $body, 'recaptcha' ) ) { + return true; } - } elseif ( false !== strpos( $final_url, '/sorry/' ) ) { - return true; } return false; @@ -95,7 +96,7 @@ public function fetch( $feed ) { $url, [ 'timeout' => 30, - 'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'user-agent' => self::USER_AGENT, ] ); @@ -109,7 +110,7 @@ public function fetch( $feed ) { } // Check if the response indicates rate limiting by Google. - if ( $this->is_rate_limited_response( $response ) ) { + if ( self::is_rate_limited_response( $response ) ) { pf_log( 'Google Scholar response indicates rate limiting (CAPTCHA/sorry page)' ); return [ 'success' => false, @@ -346,14 +347,14 @@ public function health_check( \PressForward\Core\Models\Feed $feed, $is_new_feed $feed_url, [ 'timeout' => 30, - 'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'user-agent' => self::USER_AGENT, ] ); $body = ''; if ( ! is_wp_error( $response ) ) { // Check if the response indicates rate limiting by Google. - if ( $this->is_rate_limited_response( $response ) ) { + if ( self::is_rate_limited_response( $response ) ) { pf_log( 'Google Scholar health check response indicates rate limiting (CAPTCHA/sorry page)' ); // Don't record this as a successful request. // Don't mark the feed as invalid, just skip health check. From 909c667610874dbb0ef2b69bca3c993498903745 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 17:36:31 +0000 Subject: [PATCH 06/10] Fix validate_feed error handling: reorder checks, handle 429 specifically, preserve error messages Co-authored-by: boonebgorges <246627+boonebgorges@users.noreply.github.com> --- Core/Schema/Feeds.php | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Core/Schema/Feeds.php b/Core/Schema/Feeds.php index fb20c272..98de12dc 100644 --- a/Core/Schema/Feeds.php +++ b/Core/Schema/Feeds.php @@ -777,6 +777,11 @@ public function validate_feed_cb() { $retval['feedUrl'] = $validated['feedUrl']; wp_send_json_success( $retval ); + } else { + // Validation failed - use the message from validate_feed(). + if ( ! empty( $validated['message'] ) ) { + $retval['message'] = $validated['message']; + } } wp_send_json_error( $retval ); @@ -834,11 +839,20 @@ public static function validate_feed( $url ) { if ( is_wp_error( $request ) ) { $retval['message'] = $request->get_error_message(); - } elseif ( 200 !== wp_remote_retrieve_response_code( $request ) ) { - $retval['message'] = __( 'The URL returned an error.', 'pressforward' ); } elseif ( PF_Google_Scholar::is_rate_limited_response( $request ) ) { // Detected Google's rate limiting page. $retval['message'] = __( 'Google Scholar is rate limiting requests from your server. Please try again later.', 'pressforward' ); + } elseif ( 200 !== wp_remote_retrieve_response_code( $request ) ) { + $response_code = wp_remote_retrieve_response_code( $request ); + if ( 429 === $response_code ) { + $retval['message'] = __( 'Google Scholar is rate limiting requests. Please try again later.', 'pressforward' ); + } else { + $retval['message'] = sprintf( + // translators: %d is the HTTP response code. + __( 'The URL returned an error (HTTP %d).', 'pressforward' ), + $response_code + ); + } } else { // Record successful request only if not rate limited. GoogleScholarRateLimiter::record_request(); From 14961be715b9f09681cf073233028c488629d89f Mon Sep 17 00:00:00 2001 From: Boone B Gorges Date: Wed, 28 Jan 2026 12:40:08 -0600 Subject: [PATCH 07/10] Ensure proper namespacing in Feeds.php. --- Core/Schema/Feeds.php | 1 + 1 file changed, 1 insertion(+) diff --git a/Core/Schema/Feeds.php b/Core/Schema/Feeds.php index 98de12dc..b980a4b7 100644 --- a/Core/Schema/Feeds.php +++ b/Core/Schema/Feeds.php @@ -13,6 +13,7 @@ use PressForward\Controllers\Metas; use PressForward\Core\Models\Feed; use PressForward\Core\Utility\GoogleScholarRateLimiter; +use PF_Google_Scholar; /** * Database class for manipulating feed. From 79b047d935091bc0216df1203d25858e52fc28da Mon Sep 17 00:00:00 2001 From: Boone B Gorges Date: Wed, 28 Jan 2026 12:43:06 -0600 Subject: [PATCH 08/10] Code formatting. --- Core/Models/Feed.php | 4 +-- Core/Utility/GoogleScholarRateLimiter.php | 44 +++++++++++------------ Core/Utility/Retrieval.php | 4 +-- modules/google-scholar/google-scholar.php | 6 ++-- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/Core/Models/Feed.php b/Core/Models/Feed.php index fc93706c..30d34891 100644 --- a/Core/Models/Feed.php +++ b/Core/Models/Feed.php @@ -207,8 +207,8 @@ public function get_next_scheduled_retrieval_string() { */ public function schedule_retrieval( $args = [] ) { // Determine if this is a Google Scholar feed. - $feeds_schema = pressforward( 'schema.feeds' ); - $feed_type = $feeds_schema->get_pf_feed_type( $this->get( 'id' ) ); + $feeds_schema = pressforward( 'schema.feeds' ); + $feed_type = $feeds_schema->get_pf_feed_type( $this->get( 'id' ) ); $is_google_scholar = in_array( $feed_type, [ 'google-scholar', 'google-scholar-keyword', 'google-scholar-author' ], true ); // Set default interval and nextrun based on feed type. diff --git a/Core/Utility/GoogleScholarRateLimiter.php b/Core/Utility/GoogleScholarRateLimiter.php index 0cfb4553..d6b81638 100644 --- a/Core/Utility/GoogleScholarRateLimiter.php +++ b/Core/Utility/GoogleScholarRateLimiter.php @@ -102,11 +102,11 @@ public static function is_request_allowed() { $timestamps = self::get_request_timestamps(); $timestamps = self::cleanup_old_timestamps( $timestamps ); - $now = time(); - $one_hour_ago = $now - HOUR_IN_SECONDS; - $one_day_ago = $now - DAY_IN_SECONDS; - $max_per_hour = self::get_max_requests_per_hour(); - $max_per_day = self::get_max_requests_per_day(); + $now = time(); + $one_hour_ago = $now - HOUR_IN_SECONDS; + $one_day_ago = $now - DAY_IN_SECONDS; + $max_per_hour = self::get_max_requests_per_hour(); + $max_per_day = self::get_max_requests_per_day(); // Count requests in the last hour. $requests_last_hour = count( @@ -131,34 +131,34 @@ function ( $timestamp ) use ( $one_day_ago ) { // Check hourly limit. if ( $requests_last_hour >= $max_per_hour ) { return [ - 'allowed' => false, - 'reason' => 'hourly_limit', - 'message' => sprintf( + 'allowed' => false, + 'reason' => 'hourly_limit', + 'message' => sprintf( // translators: 1: number of requests, 2: time period. __( 'Google Scholar request limit reached: %1$d requests per %2$s. Please try again later.', 'pressforward' ), $max_per_hour, __( 'hour', 'pressforward' ) ), - 'requests_count' => $requests_last_hour, - 'limit' => $max_per_hour, - 'retry_after' => self::get_retry_after_seconds( $timestamps, HOUR_IN_SECONDS, $max_per_hour ), + 'requests_count' => $requests_last_hour, + 'limit' => $max_per_hour, + 'retry_after' => self::get_retry_after_seconds( $timestamps, HOUR_IN_SECONDS, $max_per_hour ), ]; } // Check daily limit. if ( $requests_last_day >= $max_per_day ) { return [ - 'allowed' => false, - 'reason' => 'daily_limit', - 'message' => sprintf( + 'allowed' => false, + 'reason' => 'daily_limit', + 'message' => sprintf( // translators: 1: number of requests, 2: time period. __( 'Google Scholar request limit reached: %1$d requests per %2$s. Please try again later.', 'pressforward' ), $max_per_day, __( 'day', 'pressforward' ) ), - 'requests_count' => $requests_last_day, - 'limit' => $max_per_day, - 'retry_after' => self::get_retry_after_seconds( $timestamps, DAY_IN_SECONDS, $max_per_day ), + 'requests_count' => $requests_last_day, + 'limit' => $max_per_day, + 'retry_after' => self::get_retry_after_seconds( $timestamps, DAY_IN_SECONDS, $max_per_day ), ]; } @@ -222,11 +222,11 @@ public static function get_status() { $timestamps = self::get_request_timestamps(); $timestamps = self::cleanup_old_timestamps( $timestamps ); - $now = time(); - $one_hour_ago = $now - HOUR_IN_SECONDS; - $one_day_ago = $now - DAY_IN_SECONDS; - $max_per_hour = self::get_max_requests_per_hour(); - $max_per_day = self::get_max_requests_per_day(); + $now = time(); + $one_hour_ago = $now - HOUR_IN_SECONDS; + $one_day_ago = $now - DAY_IN_SECONDS; + $max_per_hour = self::get_max_requests_per_hour(); + $max_per_day = self::get_max_requests_per_day(); $requests_last_hour = count( array_filter( diff --git a/Core/Utility/Retrieval.php b/Core/Utility/Retrieval.php index e08707f4..af382ee4 100644 --- a/Core/Utility/Retrieval.php +++ b/Core/Utility/Retrieval.php @@ -207,8 +207,8 @@ public function ajax_update_feed_handler() { } // Check if this is a Google Scholar feed and if rate limited. - $feeds_schema = pressforward( 'schema.feeds' ); - $feed_type = $feeds_schema->get_pf_feed_type( $post_id ); + $feeds_schema = pressforward( 'schema.feeds' ); + $feed_type = $feeds_schema->get_pf_feed_type( $post_id ); $is_google_scholar = in_array( $feed_type, [ 'google-scholar', 'google-scholar-keyword', 'google-scholar-author' ], true ); if ( $is_google_scholar ) { diff --git a/modules/google-scholar/google-scholar.php b/modules/google-scholar/google-scholar.php index 6b414fb4..4f0b55c1 100644 --- a/modules/google-scholar/google-scholar.php +++ b/modules/google-scholar/google-scholar.php @@ -54,9 +54,9 @@ public static function is_rate_limited_response( $response ) { if ( $body ) { // Check for common indicators of Google's rate limiting page. if ( false !== stripos( $body, '/sorry/' ) || - false !== stripos( $body, 'automated queries' ) || - false !== stripos( $body, 'unusual traffic' ) || - false !== stripos( $body, 'recaptcha' ) ) { + false !== stripos( $body, 'automated queries' ) || + false !== stripos( $body, 'unusual traffic' ) || + false !== stripos( $body, 'recaptcha' ) ) { return true; } } From c2ba62e191179776329602ce074d2a88a3a44a06 Mon Sep 17 00:00:00 2001 From: Boone B Gorges Date: Wed, 28 Jan 2026 12:43:14 -0600 Subject: [PATCH 09/10] Fix if/else formatting for PHPCS. --- Core/Schema/Feeds.php | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Core/Schema/Feeds.php b/Core/Schema/Feeds.php index b980a4b7..84caebc5 100644 --- a/Core/Schema/Feeds.php +++ b/Core/Schema/Feeds.php @@ -778,11 +778,9 @@ public function validate_feed_cb() { $retval['feedUrl'] = $validated['feedUrl']; wp_send_json_success( $retval ); - } else { + } elseif ( ! empty( $validated['message'] ) ) { // Validation failed - use the message from validate_feed(). - if ( ! empty( $validated['message'] ) ) { - $retval['message'] = $validated['message']; - } + $retval['message'] = $validated['message']; } wp_send_json_error( $retval ); From 2a82f641442ce0982aa701887cbcc56587ef5d45 Mon Sep 17 00:00:00 2001 From: Boone B Gorges Date: Wed, 28 Jan 2026 14:24:26 -0600 Subject: [PATCH 10/10] Remove unneeded return statement. `wp_send_json_error()` exits. --- Core/Utility/Retrieval.php | 1 - 1 file changed, 1 deletion(-) diff --git a/Core/Utility/Retrieval.php b/Core/Utility/Retrieval.php index af382ee4..5c885b16 100644 --- a/Core/Utility/Retrieval.php +++ b/Core/Utility/Retrieval.php @@ -221,7 +221,6 @@ public function ajax_update_feed_handler() { 'retry_after' => $rate_limit_check['retry_after'], ] ); - return; } }