diff --git a/Core/Models/Feed.php b/Core/Models/Feed.php index 00454dcd..30d34891 100644 --- a/Core/Models/Feed.php +++ b/Core/Models/Feed.php @@ -206,10 +206,19 @@ public function get_next_scheduled_retrieval_string() { * @return true|\WP_Error True if scheduled, WP_Error if not. See wp_schedule_event(). */ public function schedule_retrieval( $args = [] ) { + // Determine if this is a Google Scholar feed. + $feeds_schema = pressforward( 'schema.feeds' ); + $feed_type = $feeds_schema->get_pf_feed_type( $this->get( 'id' ) ); + $is_google_scholar = in_array( $feed_type, [ 'google-scholar', 'google-scholar-keyword', 'google-scholar-author' ], true ); + + // Set default interval and nextrun based on feed type. + $default_interval = $is_google_scholar ? 'pf_google_scholar_interval' : 'pf_interval'; + $default_nextrun = $is_google_scholar ? time() + ( wp_rand( 0, 120 ) * MINUTE_IN_SECONDS ) : time() + ( wp_rand( 0, 30 ) * MINUTE_IN_SECONDS ); + $r = array_merge( [ - 'interval' => 'pf_interval', - 'nextrun' => time() + ( wp_rand( 0, 30 ) * MINUTE_IN_SECONDS ), + 'interval' => $default_interval, + 'nextrun' => $default_nextrun, ], $args ); diff --git a/Core/Schema/Feeds.php b/Core/Schema/Feeds.php index 5508a26b..84caebc5 100644 --- a/Core/Schema/Feeds.php +++ b/Core/Schema/Feeds.php @@ -12,6 +12,8 @@ use Intraxia\Jaxion\Contract\Core\HasFilters; use PressForward\Controllers\Metas; use PressForward\Core\Models\Feed; +use PressForward\Core\Utility\GoogleScholarRateLimiter; +use PF_Google_Scholar; /** * Database class for manipulating feed. @@ -776,6 +778,9 @@ public function validate_feed_cb() { $retval['feedUrl'] = $validated['feedUrl']; wp_send_json_success( $retval ); + } elseif ( ! empty( $validated['message'] ) ) { + // Validation failed - use the message from validate_feed(). 
+ $retval['message'] = $validated['message']; } wp_send_json_error( $retval ); @@ -817,12 +822,40 @@ public static function validate_feed( $url ) { switch ( $feed_type ) { case 'google-scholar-author': case 'google-scholar-keyword': - $request = wp_remote_get( $url ); + // Check rate limit before making request. + $rate_limit_check = GoogleScholarRateLimiter::is_request_allowed(); + if ( true !== $rate_limit_check ) { + $retval['message'] = $rate_limit_check['message']; + break; + } + + $request = wp_remote_get( + $url, + [ + 'user-agent' => PF_Google_Scholar::USER_AGENT, + ] + ); + if ( is_wp_error( $request ) ) { $retval['message'] = $request->get_error_message(); + } elseif ( PF_Google_Scholar::is_rate_limited_response( $request ) ) { + // Detected Google's rate limiting page. + $retval['message'] = __( 'Google Scholar is rate limiting requests from your server. Please try again later.', 'pressforward' ); } elseif ( 200 !== wp_remote_retrieve_response_code( $request ) ) { - $retval['message'] = __( 'The URL returned an error.', 'pressforward' ); + $response_code = wp_remote_retrieve_response_code( $request ); + if ( 429 === $response_code ) { + $retval['message'] = __( 'Google Scholar is rate limiting requests. Please try again later.', 'pressforward' ); + } else { + $retval['message'] = sprintf( + // translators: %d is the HTTP response code. + __( 'The URL returned an error (HTTP %d).', 'pressforward' ), + $response_code + ); + } } else { + // Record successful request only if not rate limited. + GoogleScholarRateLimiter::record_request(); + $retval['success'] = true; $retval['feedUrl'] = $url; $retval['message'] = 'google-scholar-author' === $feed_type ? 
__( 'Google Scholar author feed detected.', 'pressforward' ) : __( 'Google Scholar keyword feed detected.', 'pressforward' );
diff --git a/Core/Utility/GoogleScholarRateLimiter.php b/Core/Utility/GoogleScholarRateLimiter.php
new file mode 100644
index 00000000..d6b81638
--- /dev/null
+++ b/Core/Utility/GoogleScholarRateLimiter.php
@@ -0,0 +1,267 @@
+ $cutoff_time;
+			}
+		);
+	}
+
+	/**
+	 * Check if a request is allowed based on rate limits.
+	 *
+	 * @return bool|array True if allowed, array with error details if not.
+	 */
+	public static function is_request_allowed() {
+		$timestamps = self::get_request_timestamps();
+		$timestamps = self::cleanup_old_timestamps( $timestamps );
+
+		$now          = time();
+		$one_hour_ago = $now - HOUR_IN_SECONDS;
+		$one_day_ago  = $now - DAY_IN_SECONDS;
+		$max_per_hour = self::get_max_requests_per_hour();
+		$max_per_day  = self::get_max_requests_per_day();
+
+		// Count requests in the last hour.
+		$requests_last_hour = count(
+			array_filter(
+				$timestamps,
+				function ( $timestamp ) use ( $one_hour_ago ) {
+					return $timestamp > $one_hour_ago;
+				}
+			)
+		);
+
+		// Count requests in the last day.
+		$requests_last_day = count(
+			array_filter(
+				$timestamps,
+				function ( $timestamp ) use ( $one_day_ago ) {
+					return $timestamp > $one_day_ago;
+				}
+			)
+		);
+
+		// Check hourly limit.
+		if ( $requests_last_hour >= $max_per_hour ) {
+			return [
+				'allowed'        => false,
+				'reason'         => 'hourly_limit',
+				'message'        => sprintf(
+					// translators: 1: number of requests, 2: time period.
+					__( 'Google Scholar request limit reached: %1$d requests per %2$s. Please try again later.', 'pressforward' ),
+					$max_per_hour,
+					__( 'hour', 'pressforward' )
+				),
+				'requests_count' => $requests_last_hour,
+				'limit'          => $max_per_hour,
+				'retry_after'    => self::get_retry_after_seconds( $timestamps, HOUR_IN_SECONDS, $max_per_hour ),
+			];
+		}
+
+		// Check daily limit.
+		if ( $requests_last_day >= $max_per_day ) {
+			return [
+				'allowed'        => false,
+				'reason'         => 'daily_limit',
+				'message'        => sprintf(
+					// translators: 1: number of requests, 2: time period.
+					__( 'Google Scholar request limit reached: %1$d requests per %2$s. Please try again later.', 'pressforward' ),
+					$max_per_day,
+					__( 'day', 'pressforward' )
+				),
+				'requests_count' => $requests_last_day,
+				'limit'          => $max_per_day,
+				'retry_after'    => self::get_retry_after_seconds( $timestamps, DAY_IN_SECONDS, $max_per_day ),
+			];
+		}
+
+		return true;
+	}
+
+	/**
+	 * Get the number of seconds until the in-window request count drops below the limit.
+	 *
+	 * @param array $timestamps Array of Unix timestamps.
+	 * @param int   $period     Time period in seconds.
+	 * @param int   $limit      Maximum requests in the period.
+	 * @return int Seconds until retry is allowed; 0 when already under the limit.
+	 */
+	protected static function get_retry_after_seconds( $timestamps, $period, $limit ) {
+		$now         = time();
+		$cutoff_time = $now - $period;
+
+		$recent_timestamps = array_filter(
+			$timestamps,
+			function ( $timestamp ) use ( $cutoff_time ) {
+				return $timestamp > $cutoff_time;
+			}
+		);
+
+		if ( empty( $recent_timestamps ) || count( $recent_timestamps ) < $limit ) {
+			return 0;
+		}
+
+		// Sort ascending; sort() also re-indexes the filtered array from 0.
+		sort( $recent_timestamps );
+
+		// With N requests in the window, the N - $limit + 1 oldest must expire before a retry.
+		$expire_index       = min( max( 0, count( $recent_timestamps ) - (int) $limit ), count( $recent_timestamps ) - 1 );
+		$expiring_timestamp = $recent_timestamps[ $expire_index ];
+
+		$retry_after = ( $expiring_timestamp + $period ) - $now;
+
+		return max( 0, $retry_after );
+	}
+
+	/**
+	 * Record a request timestamp.
+	 *
+	 * @return bool True on success, false on failure.
+	 */
+	public static function record_request() {
+		$timestamps   = self::get_request_timestamps();
+		$timestamps   = self::cleanup_old_timestamps( $timestamps );
+		$timestamps[] = time();
+
+		return self::save_request_timestamps( $timestamps );
+	}
+
+	/**
+	 * Get current rate limit status information.
+	 *
+	 * @return array Status information.
+	 */
+	public static function get_status() {
+		$timestamps = self::get_request_timestamps();
+		$timestamps = self::cleanup_old_timestamps( $timestamps );
+
+		$now          = time();
+		$one_hour_ago = $now - HOUR_IN_SECONDS;
+		$one_day_ago  = $now - DAY_IN_SECONDS;
+		$max_per_hour = self::get_max_requests_per_hour();
+		$max_per_day  = self::get_max_requests_per_day();
+
+		$requests_last_hour = count(
+			array_filter(
+				$timestamps,
+				function ( $timestamp ) use ( $one_hour_ago ) {
+					return $timestamp > $one_hour_ago;
+				}
+			)
+		);
+
+		$requests_last_day = count(
+			array_filter(
+				$timestamps,
+				function ( $timestamp ) use ( $one_day_ago ) {
+					return $timestamp > $one_day_ago;
+				}
+			)
+		);
+
+		return [
+			'requests_last_hour'    => $requests_last_hour,
+			'max_requests_per_hour' => $max_per_hour,
+			'requests_last_day'     => $requests_last_day,
+			'max_requests_per_day'  => $max_per_day,
+			'hourly_remaining'      => max( 0, $max_per_hour - $requests_last_hour ),
+			'daily_remaining'       => max( 0, $max_per_day - $requests_last_day ),
+		];
+	}
+
+	/**
+	 * Reset all stored request timestamps.
+	 *
+	 * @return bool True on success, false on failure.
+	 */
+	public static function reset() {
+		return delete_option( self::OPTION_NAME );
+	}
+}
diff --git a/Core/Utility/Retrieval.php b/Core/Utility/Retrieval.php
index 9f89bcaa..5c885b16 100644
--- a/Core/Utility/Retrieval.php
+++ b/Core/Utility/Retrieval.php
@@ -8,6 +8,7 @@
 namespace PressForward\Core\Utility;
 
 use PressForward\Core\Models\Feed;
+use PressForward\Core\Utility\GoogleScholarRateLimiter;
 
 /**
  * Feed 'slurping' class.
@@ -57,6 +58,13 @@ public function cron_add_short( $schedules ) {
 			'interval' => $pf_interval * 60,
 			'display'  => __( 'PressForward Retrieval Interval', 'pressforward' ),
 		);
+
+		// Add daily schedule for Google Scholar feeds.
+		$schedules['pf_google_scholar_interval'] = array(
+			'interval' => DAY_IN_SECONDS,
+			'display'  => __( 'PressForward Google Scholar Retrieval Interval (Daily)', 'pressforward' ),
+		);
+
 		return $schedules;
 	}
 
@@ -198,6 +206,24 @@ public function ajax_update_feed_handler() {
 			return;
 		}
 
+		// Check if this is a Google Scholar feed and if rate limited.
+		$feeds_schema      = pressforward( 'schema.feeds' );
+		$feed_type         = $feeds_schema->get_pf_feed_type( $post_id );
+		$is_google_scholar = in_array( $feed_type, [ 'google-scholar', 'google-scholar-keyword', 'google-scholar-author' ], true );
+
+		if ( $is_google_scholar ) {
+			$rate_limit_check = GoogleScholarRateLimiter::is_request_allowed();
+			if ( true !== $rate_limit_check ) {
+				pf_log( 'Manual Google Scholar refresh rate limit reached: ' . $rate_limit_check['message'] );
+				wp_send_json_error(
+					[
+						'message'     => $rate_limit_check['message'],
+						'retry_after' => $rate_limit_check['retry_after'],
+					]
+				);
+			}
+		}
+
 		$retrieved_status = $feed->retrieve();
 
 		$retval = [
diff --git a/modules/google-scholar/google-scholar.php b/modules/google-scholar/google-scholar.php
index 229fac88..4f0b55c1 100644
--- a/modules/google-scholar/google-scholar.php
+++ b/modules/google-scholar/google-scholar.php
@@ -8,11 +8,19 @@
 
 use PressForward\Interfaces\FeedSource;
 use PressForward\Core\DTO\FeedItem;
+use PressForward\Core\Utility\GoogleScholarRateLimiter;
 
 /**
  * PF_Google_Scholar class.
  */
 class PF_Google_Scholar extends PF_Module implements FeedSource {
+	/**
+	 * User-Agent string to use for Google Scholar requests.
+	 *
+	 * @var string
+	 */
+	const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
+
 	/**
 	 * Constructor.
 	 */
@@ -21,6 +29,41 @@ public function __construct() {
 		parent::start();
 	}
 
+	/**
+	 * Detects if a response indicates Google rate limiting.
+	 *
+	 * Google redirects to a CAPTCHA page at /sorry/index when rate limiting.
+	 * This can be a 302 redirect or a 200 response with the sorry page content.
+	 *
+	 * @param array|\WP_Error $response HTTP response from wp_remote_get().
+	 * @return bool True if rate limiting is detected, false otherwise.
+	 */
+	public static function is_rate_limited_response( $response ) {
+		if ( is_wp_error( $response ) ) {
+			return false;
+		}
+
+		// Check for redirect to sorry page; a repeated header is returned as an array, so flatten it.
+		$redirect_url = implode( ' ', (array) wp_remote_retrieve_header( $response, 'location' ) );
+		if ( $redirect_url && false !== stripos( $redirect_url, '/sorry/' ) ) {
+			return true;
+		}
+
+		// Check the body for signs of the sorry page (case-insensitive).
+		$body = wp_remote_retrieve_body( $response );
+		if ( $body ) {
+			// Check for common indicators of Google's rate limiting page.
+			if ( false !== stripos( $body, '/sorry/' ) ||
+				false !== stripos( $body, 'automated queries' ) ||
+				false !== stripos( $body, 'unusual traffic' ) ||
+				false !== stripos( $body, 'recaptcha' ) ) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+
 	/**
 	 * Fetches data from URL.
 	 *
@@ -28,6 +71,17 @@ public function __construct() {
 	 * @return array|\WP_Error
 	 */
 	public function fetch( $feed ) {
+		// Check rate limit before making request.
+		$rate_limit_check = GoogleScholarRateLimiter::is_request_allowed();
+		if ( true !== $rate_limit_check ) {
+			pf_log( 'Google Scholar rate limit reached: ' . $rate_limit_check['message'] );
+			return [
+				'success' => false,
+				'message' => $rate_limit_check['message'],
+				'entries' => [],
+			];
+		}
+
 		$url = $feed->get( 'remote_feed_url' );
 
 		$is_profile = false !== strpos( $url, 'user=' );
@@ -42,7 +96,7 @@ public function fetch( $feed ) {
 			$url,
 			[
 				'timeout'    => 30,
-				'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
+				'user-agent' => self::USER_AGENT,
 			]
 		);
 
@@ -55,6 +109,16 @@ public function fetch( $feed ) {
 			];
 		}
 
+		// Check if the response indicates rate limiting by Google.
+ if ( self::is_rate_limited_response( $response ) ) { + pf_log( 'Google Scholar response indicates rate limiting (CAPTCHA/sorry page)' ); + return [ + 'success' => false, + 'message' => __( 'Google Scholar is rate limiting this request. Please try again later.', 'pressforward' ), + 'entries' => [], + ]; + } + // Get the body content. $html = wp_remote_retrieve_body( $response ); @@ -80,6 +144,9 @@ public function fetch( $feed ) { $entries = $this->parse_items_from_search( $xpath, $feed ); } + // Record successful request. + GoogleScholarRateLimiter::record_request(); + return $entries; } @@ -263,6 +330,14 @@ protected function parse_items_from_search( $xpath, $feed ) { * @param bool $is_new_feed Whether the feed is new. */ public function health_check( \PressForward\Core\Models\Feed $feed, $is_new_feed = false ) { + // Check rate limit before making request. + $rate_limit_check = GoogleScholarRateLimiter::is_request_allowed(); + if ( true !== $rate_limit_check ) { + pf_log( 'Google Scholar health check rate limit reached: ' . $rate_limit_check['message'] ); + // For health checks, we'll skip rather than fail when rate limited. + return; + } + $feed_url = $feed->get( 'remote_feed_url' ); $feed_is_valid = false; @@ -272,12 +347,20 @@ public function health_check( \PressForward\Core\Models\Feed $feed, $is_new_feed $feed_url, [ 'timeout' => 30, - 'user-agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36', + 'user-agent' => self::USER_AGENT, ] ); $body = ''; if ( ! is_wp_error( $response ) ) { + // Check if the response indicates rate limiting by Google. + if ( self::is_rate_limited_response( $response ) ) { + pf_log( 'Google Scholar health check response indicates rate limiting (CAPTCHA/sorry page)' ); + // Don't record this as a successful request. + // Don't mark the feed as invalid, just skip health check. 
+				return;
+			}
+
 			$body = wp_remote_retrieve_body( $response );
 
 			// Check if the body contains Google Scholar specific content.
@@ -302,6 +385,9 @@ public function health_check( \PressForward\Core\Models\Feed $feed, $is_new_feed
 			$alert_box->dismiss_alert( $feed->get( 'id' ) );
 		}
 
+		// Record successful request.
+		GoogleScholarRateLimiter::record_request();
+
 		if ( $is_new_feed ) {
 			// Get the feed title from search box HTML, class 'gs_in_txt'.
 			$doc = new DOMDocument();
diff --git a/tests/test-google-scholar-rate-limiter.php b/tests/test-google-scholar-rate-limiter.php
new file mode 100644
index 00000000..a3d603d7
--- /dev/null
+++ b/tests/test-google-scholar-rate-limiter.php
@@ -0,0 +1,141 @@
+assertTrue( $result );
+	}
+
+	/**
+	 * Test that requests are blocked after hourly limit is reached.
+	 */
+	public function test_hourly_limit_is_enforced() {
+		$max_per_hour = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_hour();
+
+		// Make max requests.
+		for ( $i = 0; $i < $max_per_hour; $i++ ) {
+			$this->assertTrue( \PressForward\Core\Utility\GoogleScholarRateLimiter::is_request_allowed() );
+			\PressForward\Core\Utility\GoogleScholarRateLimiter::record_request();
+		}
+
+		// Next request should be blocked.
+		$result = \PressForward\Core\Utility\GoogleScholarRateLimiter::is_request_allowed();
+		$this->assertIsArray( $result );
+		$this->assertFalse( $result['allowed'] );
+		$this->assertEquals( 'hourly_limit', $result['reason'] );
+	}
+
+	/**
+	 * Test that requests are blocked after daily limit is reached.
+	 */
+	public function test_daily_limit_is_enforced() {
+		// Raise the hourly cap above the daily cap so the daily check is the one that trips.
+		$max_per_day = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_day();
+		update_option( 'pf_google_scholar_max_per_hour', $max_per_day + 1 );
+
+		// Make max requests.
+		for ( $i = 0; $i < $max_per_day; $i++ ) {
+			\PressForward\Core\Utility\GoogleScholarRateLimiter::record_request();
+		}
+
+		// Next request should be blocked by daily limit.
+		$result = \PressForward\Core\Utility\GoogleScholarRateLimiter::is_request_allowed();
+		$this->assertIsArray( $result );
+		$this->assertFalse( $result['allowed'] );
+		$this->assertEquals( 'daily_limit', $result['reason'] );
+	}
+
+	/**
+	 * Test that get_status returns correct information.
+	 */
+	public function test_get_status_returns_correct_info() {
+		// Make some requests.
+		\PressForward\Core\Utility\GoogleScholarRateLimiter::record_request();
+		\PressForward\Core\Utility\GoogleScholarRateLimiter::record_request();
+
+		$status = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_status();
+
+		$this->assertIsArray( $status );
+		$this->assertEquals( 2, $status['requests_last_hour'] );
+		$this->assertEquals( 2, $status['requests_last_day'] );
+		$this->assertGreaterThan( 0, $status['max_requests_per_hour'] );
+		$this->assertGreaterThan( 0, $status['max_requests_per_day'] );
+	}
+
+	/**
+	 * Test that reset clears all timestamps.
+	 */
+	public function test_reset_clears_timestamps() {
+		// Make some requests.
+		\PressForward\Core\Utility\GoogleScholarRateLimiter::record_request();
+		\PressForward\Core\Utility\GoogleScholarRateLimiter::record_request();
+
+		$status = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_status();
+		$this->assertEquals( 2, $status['requests_last_hour'] );
+
+		// Reset.
+		\PressForward\Core\Utility\GoogleScholarRateLimiter::reset();
+
+		$status = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_status();
+		$this->assertEquals( 0, $status['requests_last_hour'] );
+		$this->assertEquals( 0, $status['requests_last_day'] );
+	}
+
+	/**
+	 * Test that custom limits can be configured.
+	 */
+	public function test_custom_limits_can_be_configured() {
+		update_option( 'pf_google_scholar_max_per_hour', 5 );
+		update_option( 'pf_google_scholar_max_per_day', 20 );
+
+		$this->assertEquals( 5, \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_hour() );
+		$this->assertEquals( 20, \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_day() );
+	}
+
+	/**
+	 * Test that retry_after is calculated correctly.
+	 */
+	public function test_retry_after_is_calculated() {
+		$max_per_hour = \PressForward\Core\Utility\GoogleScholarRateLimiter::get_max_requests_per_hour();
+
+		// Make max requests.
+		for ( $i = 0; $i < $max_per_hour; $i++ ) {
+			\PressForward\Core\Utility\GoogleScholarRateLimiter::record_request();
+		}
+
+		// Next request should be blocked with retry_after value.
+		$result = \PressForward\Core\Utility\GoogleScholarRateLimiter::is_request_allowed();
+		$this->assertIsArray( $result );
+		$this->assertArrayHasKey( 'retry_after', $result );
+		$this->assertGreaterThan( 0, $result['retry_after'] );
+		$this->assertLessThanOrEqual( HOUR_IN_SECONDS, $result['retry_after'] );
+	}
+}