diff --git a/Data/db/repository_data_UCSD_database.db b/Data/db/repository_data_UCSD_database.db new file mode 100644 index 0000000..8cee796 Binary files /dev/null and b/Data/db/repository_data_UCSD_database.db differ diff --git a/repofinder/__pycache__/__init__.cpython-313.pyc b/repofinder/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000..7fcda6f Binary files /dev/null and b/repofinder/__pycache__/__init__.cpython-313.pyc differ diff --git a/repofinder/scraping/__pycache__/__init__.cpython-313.pyc b/repofinder/scraping/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000..50cabb4 Binary files /dev/null and b/repofinder/scraping/__pycache__/__init__.cpython-313.pyc differ diff --git a/repofinder/scraping/__pycache__/get_contributors.cpython-313.pyc b/repofinder/scraping/__pycache__/get_contributors.cpython-313.pyc new file mode 100644 index 0000000..7962f95 Binary files /dev/null and b/repofinder/scraping/__pycache__/get_contributors.cpython-313.pyc differ diff --git a/repofinder/scraping/__pycache__/get_organizations.cpython-313.pyc b/repofinder/scraping/__pycache__/get_organizations.cpython-313.pyc new file mode 100644 index 0000000..930fb0d Binary files /dev/null and b/repofinder/scraping/__pycache__/get_organizations.cpython-313.pyc differ diff --git a/repofinder/scraping/__pycache__/get_repo_extras.cpython-313.pyc b/repofinder/scraping/__pycache__/get_repo_extras.cpython-313.pyc new file mode 100644 index 0000000..69690f6 Binary files /dev/null and b/repofinder/scraping/__pycache__/get_repo_extras.cpython-313.pyc differ diff --git a/repofinder/scraping/__pycache__/get_repositories.cpython-313.pyc b/repofinder/scraping/__pycache__/get_repositories.cpython-313.pyc new file mode 100644 index 0000000..fc4ee24 Binary files /dev/null and b/repofinder/scraping/__pycache__/get_repositories.cpython-313.pyc differ diff --git a/repofinder/scraping/__pycache__/json_to_db.cpython-313.pyc b/repofinder/scraping/__pycache__/json_to_db.cpython-313.pyc new file mode 100644 index 0000000..73467ba Binary files /dev/null and b/repofinder/scraping/__pycache__/json_to_db.cpython-313.pyc differ diff --git a/repofinder/scraping/__pycache__/repo_scraping_utils.cpython-313.pyc b/repofinder/scraping/__pycache__/repo_scraping_utils.cpython-313.pyc new file mode 100644 index 0000000..7144748 Binary files /dev/null and b/repofinder/scraping/__pycache__/repo_scraping_utils.cpython-313.pyc differ diff --git a/repofinder/scraping/repo_scraping_utils.py b/repofinder/scraping/repo_scraping_utils.py index 322ffb8..85e5bee 100644 --- a/repofinder/scraping/repo_scraping_utils.py +++ b/repofinder/scraping/repo_scraping_utils.py @@ -10,77 +10,43 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) - GITHUB_API_URL = "https://api.github.com" MAX_RETRIES = 1 RETRY_DELAY = 2 # seconds - def github_api_request(url, headers, params=None): """ Sends a GET request to the GitHub API with rate limit handling. - - Parameters - ---------- - url : str - The API endpoint URL. - headers : dict - HTTP headers for the request. - params : dict, optional - Query parameters for the request (default is None). - - Returns - ------- - tuple - A tuple containing: - - dict: The JSON response from the API. - - dict: The response headers. """ for attempt in range(1, MAX_RETRIES + 1): logger.debug(f"Attempt {attempt} for URL: {url}") try: response = requests.get(url, headers=headers, params=params, timeout=10) + logger.debug(f"Response status code: {response.status_code}") response.raise_for_status() - except: - pass - logger.debug(f"Response status code: {response.status_code}") - if response.status_code == 200: - logger.debug("Successful response.") - return response.json(), response.headers - elif response.status_code == 404: - logger.warning(f"Resource not found: {url}. Exiting without retry.") - return None, None - elif response.status_code == 403 and 'X-RateLimit-Remaining' in response.headers: - if response.headers['X-RateLimit-Remaining'] == '0': - reset_time = int(response.headers.get('X-RateLimit-Reset', time.time())) - sleep_time = max(reset_time - int(time.time()), 1) # Avoid negative sleep times - logger.warning(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.") - time.sleep(sleep_time) - continue # Retry after sleeping - else: - logger.error(f"Error: {response.status_code} - {response.reason}") - if attempt == MAX_RETRIES: - response.raise_for_status() + + if response.status_code == 200: + logger.debug("Successful response.") + return response.json(), response.headers + elif response.status_code == 404: + logger.warning(f"Resource not found: {url}. Exiting without retry.") + return None, None + elif response.status_code == 403 and 'X-RateLimit-Remaining' in response.headers: + if response.headers['X-RateLimit-Remaining'] == '0': + reset_time = int(response.headers.get('X-RateLimit-Reset', time.time())) + sleep_time = max(reset_time - int(time.time()), 1) + logger.warning(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.") + time.sleep(sleep_time) + continue + except Exception as e: + logger.error(f"Error requesting {url}: {e}") time.sleep(RETRY_DELAY) continue + logger.error(f"Failed to get a successful response after {MAX_RETRIES} attempts: {url}") return None, None - def get_next_link(headers): - """ - Parses the 'Link' header from a GitHub API response to find the next page URL. - - Parameters - ---------- - headers : dict - Response headers from the GitHub API. - - Returns - ------- - str or None - The URL for the next page if available, otherwise None. - """ link_header = headers.get('Link', '') if not link_header: return None @@ -96,48 +62,17 @@ def get_next_link(headers): return next_url return None - def build_queries(config_file): - """ - Builds a list of GitHub search queries based on university-related metadata. - - Parameters - ---------- - env : str - The path to the JSON configuration file containing university details. - - Returns - ------- - tuple - A tuple containing: - - list of str: Search query terms for GitHub. - - str: The university acronym for output file naming. - - Notes - ----- - - Reads university metadata from the provided JSON file. - - Constructs query terms based on the university's name, acronym, email domain, and website. - - Uses `itertools.product` to generate query combinations. - - Ensures query terms are sanitized to prevent malformed queries. - - """ - with open(config_file, encoding="utf-8") as envfile: config = json.load(envfile) - # Assign values to variables using keys from the config university_name = config["UNIVERSITY_NAME"] university_acronym = config["UNIVERSITY_ACRONYM"] university_email_domain = config["UNIVERSITY_EMAIL_DOMAIN"] additional_queries = config["ADDITIONAL_QUERIES"] - - # Define search fields search_fields = ["in:name", "in:description", "in:readme", "in:tags"] - - # Combine university metadata and additional queries query_terms_list = [university_name, university_acronym, university_email_domain] + additional_queries - # Generate query terms with itertools.product query_terms = [ f'"{term}" {field}' for term, field in itertools.product(query_terms_list, search_fields) @@ -145,21 +80,8 @@ def build_queries(config_file): return query_terms, university_acronym - - - def search_repositories_with_queries(query_terms, headers): - """ - Searches GitHub repositories based on query terms and records matching queries. - - Args: - query_terms (list): List of query strings. - headers (dict): HTTP headers for the request. - - Returns: - dict: A dictionary of repositories with their matching queries. - """ - repositories = [] # TODO: Figure out what to do with duplicates + repositories = [] for query_term in query_terms: params = {'q': query_term, 'per_page': 100} url = f"{GITHUB_API_URL}/search/repositories" @@ -170,15 +92,13 @@ def search_repositories_with_queries(query_terms, headers): except Exception as e: logger.error(f"Error searching repositories: {e}") break - if data: # TODO: Figure out caching + + if data: items = data.get('items', []) repositories.extend(items) next_url = get_next_link(headers_response) url = next_url - params = None # Parameters are only needed for the initial request + params = None else: break - return repositories - - - + return repositories \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 685646a..8346fa6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ -python-dotenv>=0.9.9 -matplotlib>=3.9.4 -numpy>=1.26.4,<2.0 -pandas>=2.2.3 -requests>=2.32.3 -openai>=1.75.0 -scipy>=1.15.2 -scikit-learn>=1.6.1 -imbalanced-learn>=0.13.0 -seaborn>=0.13.2 \ No newline at end of file +python-dotenv +matplotlib +numpy +pandas +requests +openai +scipy +scikit-learn +imbalanced-learn +seaborn \ No newline at end of file