Github scraper initial #38

Open · wants to merge 16 commits into base: main
score/cli.py (32 additions, 1 deletion)
@@ -2,11 +2,12 @@

import click
import duckdb

import pandas as pd
from .conda.get_conda_package_names import get_conda_package_names
from .conda.scrape_conda import scrape_conda
from .data_retrieval.json_scraper import scrape_json
from .data_retrieval.web_scraper import scrape_web
from .github.github_scraper import scrape_github_data
from .logger import setup_logger
from .utils.get_pypi_package_list import get_pypi_package_names
from .vulnerabilities.scrape_vulnerabilities import scrape_vulnerabilities
@@ -82,6 +83,36 @@ def scrape_pypi_web(num_partitions, partition, output):
click.echo("Scraping completed.")


@cli.command()
@click.option(
"-i",
"--input",
default=os.path.join(OUTPUT_ROOT, "source-urls.parquet"),
help="The input file containing the GitHub URLs",
)
@click.option(
"-o",
"--output",
default=os.path.join(OUTPUT_ROOT, "github-details.parquet"),
help="The output file to save the detailed GitHub data",
)
def scrape_github(input, output):
click.echo("Scraping GitHub data.")

# Read the input Parquet file using pandas
df = pd.read_parquet(input)
Review comment (Contributor): Can you please partition the data? This would have 30K URLs in it. Does this work locally?

if df.empty:
click.echo("No valid GitHub URLs found in the input file.")
return

# Call the scrape_github_data function to process the data
result_df = scrape_github_data(df)

click.echo(f"Saving data to {output}")
result_df.to_parquet(output)
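
Addressing the partitioning request in the review comment above: a minimal sketch of how the command could slice the roughly 30K URLs, assuming the same --num-partitions/--partition convention used by scrape_pypi_web. The option defaults and the modulo split are illustrative only, not the project's actual partitioning helper.

@cli.command()
@click.option("--num-partitions", default=10, help="Total number of partitions (illustrative default)")
@click.option("--partition", default=0, help="Index of the partition to process")
@click.option("-i", "--input", default=os.path.join(OUTPUT_ROOT, "source-urls.parquet"))
@click.option("-o", "--output", default=os.path.join(OUTPUT_ROOT, "github-details.parquet"))
def scrape_github(num_partitions, partition, input, output):
    click.echo(f"Scraping GitHub data, partition {partition} of {num_partitions}.")
    df = pd.read_parquet(input).reset_index(drop=True)
    # Keep only the rows assigned to this partition so a single run handles a small slice.
    df = df[df.index % num_partitions == partition]
    if df.empty:
        click.echo("No GitHub URLs in this partition.")
        return
    result_df = scrape_github_data(df)
    # The output path would likely need the partition index to avoid overwrites.
    result_df.to_parquet(output)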


@cli.command()
@click.option(
"--output",
score/github/github_scraper.py (100 additions, 0 deletions)
@@ -0,0 +1,100 @@
import os
import pandas as pd
import requests
from tqdm import tqdm
import logging
from ..utils.common import extract_and_map_fields

log = logging.getLogger(__name__)

# Constants
GITHUB_API_URL = "https://api.github.com/repos/"
AUTH_HEADER = {"Authorization": f"token {os.getenv('GITHUB_TOKEN', '')}"}

# Fields to extract from the GitHub API response
FIELDS_TO_EXTRACT = {
"created_at": "created_at",
"updated_at": "updated_at",
"pushed_at": "pushed_at",
"stargazers_count": "stargazers_count",
"forks_count": "forks_count",
"open_issues_count": "open_issues_count",
"subscribers_count": "subscribers_count",
"watchers_count": "watchers_count",
"contributors_url": "contributors_url",
"license.name": "license",
}


def fetch_github_data(repo_url):
"""
Fetches data from the GitHub API for a given repository URL and extracts specified fields.
Handles cases where the repository is not found (404 error) and returns a record indicating the URL is broken.

Args:
repo_url (str): The GitHub repository URL.

Returns:
dict: A dictionary containing the extracted data fields or an indication that the URL is broken.
"""
repo_name = "/".join(repo_url.split("/")[-2:])
response = requests.get(GITHUB_API_URL + repo_name, headers=AUTH_HEADER)
Review comment (Contributor): Use the get_session helper, which adds retries to the request for 500 errors.

if response.status_code == 404:
log.debug(f"Repository not found for URL {repo_url}")
return {"source_url": repo_url, "broken_url": True, "error": "404 Not Found"}

response.raise_for_status() # Raise an error for bad status codes
data = response.json()

# Use the extract_and_map_fields function to map the desired fields
extracted_data = extract_and_map_fields(data, map=FIELDS_TO_EXTRACT)

# Fetch additional details for contributors
if contributors_url := data.get("contributors_url"):
contributors_response = requests.get(contributors_url, headers=AUTH_HEADER)
if contributors_response.status_code == 200:
contributors = contributors_response.json()
extracted_data["contributors"] = contributors
Review comment (Contributor): What is the contributors data structure? Do we need to store this information to create the score?

extracted_data["contributors_count"] = len(contributors)
else:
log.debug(f"Failed to fetch contributors for URL {repo_url}")

# Drop the contributors_url from extracted_data if it exists
extracted_data.pop("contributors_url", None)

# Ensure the source_url is always included in the data
extracted_data["source_url"] = repo_url
extracted_data["broken_url"] = False

return extracted_data
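
On the reviewer's get_session suggestion: that helper is not shown in this diff, so the following is only a sketch of a requests session with urllib3 retries for 5xx responses, not the project's actual implementation.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def get_retrying_session() -> requests.Session:
    # Hypothetical stand-in for the project's get_session helper.
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount("https://", HTTPAdapter(max_retries=retries))
    session.headers.update(AUTH_HEADER)  # AUTH_HEADER is defined at the top of this module
    return session

fetch_github_data would then call session.get(...) in place of the bare requests.get(...) calls.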


def scrape_github_data(df: pd.DataFrame) -> pd.DataFrame:
"""
Initiates the scraping process using the GitHub API for the provided DataFrame.

Args:
df (pd.DataFrame): A DataFrame containing the GitHub URLs.

Returns:
pd.DataFrame: A DataFrame containing the scraped data.
"""

if df.empty:
log.debug("No valid GitHub URLs found in the input file")
return pd.DataFrame()
Review comment (Contributor, on lines +84 to +86): This should be an error.


all_repo_data = []

# Iterate over the DataFrame rows and fetch data from GitHub API
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing GitHub URLs"):
source_url = row.get("source_url") # Use .get() to safely access the value
if source_url: # Check if source_url is not None or empty
data = fetch_github_data(source_url)
if data:
all_repo_data.append(data)
else:
log.debug(f"Skipping row with missing source_url: {row}")

return pd.DataFrame(all_repo_data)
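
On the contributors question in the review above: the GitHub contributors endpoint returns a list of user objects, each carrying a contributions count, so if the score only needs aggregates the full list does not have to be persisted. A sketch of trimming it down; the top_contributors field is hypothetical and not part of the current schema.

def summarize_contributors(contributors: list) -> dict:
    # contributors looks like: [{"login": "alice", "contributions": 120, ...}, ...]
    return {
        "contributors_count": len(contributors),
        # Hypothetical extra field; drop it if the score only needs the count.
        "top_contributors": [c.get("login") for c in contributors[:5]],
    }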
score/utils/common.py (24 additions, 0 deletions)
@@ -0,0 +1,24 @@
def extract_and_map_fields(data: dict, map: dict) -> dict:
Review comment (Contributor): Please rename this file to extract_and_map_fields.py or collections.py; having a utils/common.py usually results in the file becoming very large over time.

"""
Extracts and maps fields from the provided data based on the given mapping.

Args:
data (dict): The data dictionary from which to extract fields.
map (dict): The mapping of keys to extract from the data.

Returns:
dict: A dictionary containing the extracted and mapped fields.
"""
extracted_data = {}
for key, field in map.items():
if "." in key:
top_level_key, nested_key = key.split(".")
top_level_data = data.get(top_level_key, {})
extracted_data[field] = (
top_level_data.get(nested_key)
if isinstance(top_level_data, dict)
else None
)
else:
extracted_data[field] = data.get(key)
return extracted_data
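
A quick usage example for the helper, with made-up input data, showing how a dotted key reaches one level into a nested dict:

sample = {"stargazers_count": 42, "license": {"name": "MIT"}}
mapping = {"stargazers_count": "stargazers_count", "license.name": "license"}
extract_and_map_fields(sample, map=mapping)
# -> {"stargazers_count": 42, "license": "MIT"}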