Github scraper initial #38

Open · wants to merge 16 commits into main (showing changes from 1 commit)
23 changes: 23 additions & 0 deletions score/cli.py
@@ -2,6 +2,7 @@
import threading
from data_retrieval.json_scraper import scrape_json
from data_retrieval.web_scraper import scrape_web
from data_retrieval.github_scraper import scrape_github_data
from logger import setup_logger


@@ -99,5 +100,27 @@ def scrape_pypi_both(start, end):
    click.echo("Scraping completed.")


@cli.command()
@click.option(
    "--start",
    required=True,
    help="Enter the starting letter or number to scrape (e.g., 'a' or '0').",
)
@click.option(
    "--end",
    required=True,
    help="Enter the ending letter or number to scrape (e.g., 'c' or '9').",
)
def scrape_github(start, end):
    letters_to_scrape = get_letter_range(start, end)

    # Prepare the config
    config = {"letters": letters_to_scrape}

    setup_logger()
    scrape_github_data(config)
    click.echo("Scraping completed.")


if __name__ == "__main__":
    cli()
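
For reference, a hypothetical smoke test for the new command (not part of this PR). It uses click's test runner and assumes score/cli.py is importable as cli and that a recent click version is installed, so the scrape_github function maps to the scrape-github command name:

from click.testing import CliRunner
from cli import cli  # assumed import path for score/cli.py

runner = CliRunner()
result = runner.invoke(cli, ["scrape-github", "--start", "a", "--end", "c"])
print(result.exit_code, result.output)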
129 changes: 129 additions & 0 deletions score/data_retrieval/github_scraper.py
@@ -0,0 +1,129 @@
import os
import re
import pandas as pd
import requests
from pyarrow import parquet as pq
from tqdm import tqdm
from logger import setup_logger

logger = setup_logger()

# Constants
GITHUB_API_URL = "https://api.github.com/repos/"
AUTH_HEADER = {"Authorization": ""}
# AUTH_HEADER = {"Authorization": f"token {os.getenv('GITHUB_TOKEN', '')}"}  # Use an environment variable for the GitHub token

# Fields to extract from the GitHub API response
FIELDS_TO_EXTRACT = {
    "created_at": "created_at",
    "updated_at": "updated_at",
    "pushed_at": "pushed_at",
    "stargazers_count": "stargazers_count",
    "forks_count": "forks_count",
    "open_issues_count": "open_issues_count",
    "subscribers_count": "subscribers_count",
    "watchers_count": "watchers_count",
    "releases_url": "releases_url",
    "commits_url": "commits_url",
    "collaborators_url": "collaborators_url",
    "contributors_url": "contributors_url",
Contributor comment (resolved): why store this url? can we fetch a list of collaborators instead? (A hypothetical sketch of this appears after fetch_github_data below.)
"license.name": "license",
}


def fetch_github_data(repo_url):
"""
Fetches data from the GitHub API for a given repository URL and extracts specified fields.

Args:
repo_url (str): The GitHub repository URL.

Returns:
dict: A dictionary containing the extracted data fields.
"""
repo_name = "/".join(repo_url.split("/")[-2:])
response = requests.get(GITHUB_API_URL + repo_name, headers=AUTH_HEADER)
    if response.status_code == 200:
        data = response.json()
        extracted_data = {}
        for key, field in FIELDS_TO_EXTRACT.items():
            if "." in key:
                top_level_key, nested_key = key.split(".")
                top_level_data = data.get(top_level_key, {})
                if isinstance(top_level_data, dict):
                    extracted_data[field] = top_level_data.get(nested_key, None)
                else:
                    extracted_data[field] = None
            else:
                extracted_data[field] = data.get(key, None)
        return extracted_data
    else:
        logger.error(f"Failed to fetch data for {repo_url}: {response.status_code}")
        return None
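
In response to the reviewer's question about contributors_url above, a minimal sketch of fetching contributor logins directly instead of storing the URL. This is hypothetical, not part of this PR; it reuses the module's GITHUB_API_URL, AUTH_HEADER, and logger, and the per_page cap is an illustrative choice:

def fetch_contributors(repo_name, max_contributors=100):
    """Return a list of contributor logins for an 'owner/repo' string, or None on error."""
    response = requests.get(
        f"{GITHUB_API_URL}{repo_name}/contributors",
        headers=AUTH_HEADER,
        params={"per_page": max_contributors},
    )
    if response.status_code == 200:
        # The contributors endpoint returns a list of user objects.
        return [contributor["login"] for contributor in response.json()]
    logger.error(f"Failed to fetch contributors for {repo_name}: {response.status_code}")
    return None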


def scrape_github_data(config):
"""
Scrapes GitHub data for packages specified by the configuration.

Args:
config (dict): Configuration dictionary containing letters to scrape.
"""
letters_to_scrape = config["letters"]
all_data = []

for letter in letters_to_scrape:
directory = f"output/json/first_letter={letter}"
if os.path.exists(directory):
for file_name in os.listdir(directory):
if file_name.endswith(".parquet"):
file_path = os.path.join(directory, file_name)
df = pq.read_table(file_path).to_pandas()
Contributor comment: is this from pypi? we probably should update the output name to output/pypi/json to be more clear

Contributor comment: also, you should use hive partitioning here:

pd.read_parquet(
    directory,
    filters=[("first_letter", "==",  letters_to_scrape)],
)

You will have to update the filters: "==" will not work with a list of letters. (A possible fix is sketched below.)
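
A minimal sketch of that adjustment (hypothetical, not part of this PR), assuming output/json is the root of the hive-partitioned dataset and letters_to_scrape is a list of single characters; pyarrow's parquet filters accept the "in" operator for membership tests:

import pandas as pd

letters_to_scrape = ["a", "b", "c"]  # hypothetical letter range

# Read the whole partitioned dataset once, filtering on the partition column.
df = pd.read_parquet(
    "output/json",
    filters=[("first_letter", "in", letters_to_scrape)],
)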

Contributor comment: a few more thoughts.

The source list of GitHub URLs will not be exclusive to PyPI; what if there is a conda package that is not in PyPI? We may want to do a separate step to create an output like github_urls.parquet.

As a side note, this can also be done in DuckDB like so:

-- Main query to process the data and return GitHub URLs with first_letter filter in package_data CTE
WITH pypi_package_data AS (
    SELECT 
        first_letter,
        project_urls,
        home_page
    FROM read_parquet('output/json/first_letter=*/**.parquet')
    WHERE first_letter IN ('a', 'b', 'c')  -- Replace with your desired letters
),
pypi_github_urls AS (
    SELECT 
        COALESCE(
            json_extract(project_urls, '$.Source'),
            json_extract(project_urls, '$.Homepage'),
            home_page
        ) AS source_url
    FROM pypi_package_data
)
SELECT DISTINCT source_url
FROM pypi_github_urls
WHERE source_url LIKE '%github.com%'
ORDER BY source_url;
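
For reference, a minimal sketch of running that query from Python and materializing github_urls.parquet (hypothetical, not part of this PR; assumes the duckdb package, the output/json layout above, and an illustrative letter range):

import duckdb

letters = ["a", "b", "c"]  # hypothetical letter range
letter_list = ", ".join(f"'{letter}'" for letter in letters)

query = f"""
WITH pypi_package_data AS (
    SELECT first_letter, project_urls, home_page
    FROM read_parquet('output/json/first_letter=*/**.parquet')
    WHERE first_letter IN ({letter_list})
),
pypi_github_urls AS (
    SELECT COALESCE(
        json_extract(project_urls, '$.Source'),
        json_extract(project_urls, '$.Homepage'),
        home_page
    ) AS source_url
    FROM pypi_package_data
)
SELECT DISTINCT source_url
FROM pypi_github_urls
WHERE source_url LIKE '%github.com%'
ORDER BY source_url
"""

# duckdb.sql returns a relation; convert it to pandas and write the parquet file.
duckdb.sql(query).df().to_parquet("output/github_urls.parquet")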


                    # Reconstruct project_urls from flattened columns
                    df["project_urls"] = df.filter(like="project_urls.").apply(
                        lambda row: {
                            col.split(".")[-1]: row[col]
                            for col in row.index
                            if pd.notna(row[col])
                        },
                        axis=1,
                    )

                    for _, row in tqdm(
                        df.iterrows(), total=len(df), desc=f"Processing letter {letter}"
                    ):
                        package_name = row.get("name")

                        # Get the GitHub URL from project_urls (the "Source" entry) or home_page
                        source_url = row.get("project_urls", {}).get("Source")
                        if not source_url or "github.com" not in source_url:
                            source_url = row.get("home_page")

                        # Ensure the URL is in the correct format
                        if source_url and "github.com" in source_url:
                            repo_match = re.match(
                                r"https?://github\.com/[^/]+/[^/]+", source_url
                            )
                            if repo_match:
                                data = fetch_github_data(repo_match.group())
                                if data:
                                    data["first_letter"] = letter
                                    data["package_name"] = package_name  # Add the package name
                                    all_data.append(data)

    # Save the scraped data to a parquet file
    if all_data:
        output_df = pd.DataFrame(all_data)
        output_dir = "output/github"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_file = os.path.join(output_dir, "github_data.parquet")
        output_df.to_parquet(output_file, partition_cols=["first_letter"])
        logger.info(
            "Scraping completed and data saved to output/github/github_data.parquet"
        )
    else:
        logger.info("No valid GitHub URLs found or failed to fetch data.")
1 change: 1 addition & 0 deletions score/data_retrieval/json_scraper.py
@@ -87,6 +87,7 @@ def process_packages_by_letter(letter, package_names, output_dir):
    all_package_data = []
    for package_name in tqdm(letter_package_names, desc=f"Processing letter {letter}"):
        package_data = get_package_data(package_name)
        df = pd.json_normalize(package_data)
Contributor comment: what is this doing?
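
For context, a small illustration of what pd.json_normalize does here (hypothetical data, not part of this PR): it flattens nested dicts into dotted column names, which is why the GitHub scraper above reconstructs project_urls from the project_urls.* columns.

import pandas as pd

sample = {
    "name": "example-pkg",  # hypothetical package record
    "project_urls": {"Source": "https://github.com/example/example-pkg"},
}
flat = pd.json_normalize(sample)
print(flat.columns.tolist())  # ['name', 'project_urls.Source']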

        if package_data:
            all_package_data.append(package_data)

31 changes: 0 additions & 31 deletions score/utils/common.py
@@ -2,37 +2,6 @@
import re


def input_formatter(letters_str):
    """
    Formats an input string into a sorted set of letters and digits.
    Allows for the scraper to scrape specific ranges of letters.

    Args:
        letters_str (str): A string containing ranges or individual characters (e.g., "a-d,0-3").

    Returns:
        str: A sorted string of individual characters representing the input ranges.
    """
    letters = set()
    if not letters_str:
        letters_str = "0-9,a-z"  # Default range if no input is provided
    for part in letters_str.split(","):
        part = part.strip()
        if "-" in part:
            start, end = part.split("-")
            start, end = start.strip(), end.strip()
            if start.isdigit() and end.isdigit():
                # Add all digits in the specified range to the set
                letters.update(str(i) for i in range(int(start), int(end) + 1))
            elif start.isalpha() and end.isalpha():
                # Add all letters in the specified range to the set
                letters.update(chr(i) for i in range(ord(start), ord(end) + 1))
        else:
            # Add individual characters to the set
            letters.add(part)
    return "".join(sorted(letters))


def get_all_package_names():
"""
Fetches the list of all package names from the PyPI Simple API.