From 39d2a91e3e261c9ec780ec9e35ae1776eb42c9b1 Mon Sep 17 00:00:00 2001 From: Jack Sundberg Date: Thu, 25 Aug 2022 15:36:41 -0400 Subject: [PATCH 1/3] update matproj data --- pyproject.toml | 5 + src/simmate/command_line/database.py | 8 +- .../for_providers/materials_project.py | 175 +++++++++++++----- .../third_parties/materials_project.py | 51 ++++- 4 files changed, 188 insertions(+), 51 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 05138a957..ddfea03aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -129,6 +129,11 @@ EXTRAS = [ "django-extensions >=3.1.5", # simple tools to help with django development ] +# For downloading third-party data directly from source instead of Simmate +DATA = [ + "mp-api >=0.26.3, <=0.26.3", # materials project +] + # Link to our homepage. Use github for now. [project.urls] homepage = "https://simmate.org/" diff --git a/src/simmate/command_line/database.py b/src/simmate/command_line/database.py index a5dd29e3f..e25f84a00 100644 --- a/src/simmate/command_line/database.py +++ b/src/simmate/command_line/database.py @@ -35,8 +35,8 @@ def reset(confirm_delete: bool = False, use_prebuilt: bool = None): # make sure the user knows what they are doing and actually wants to continue if not confirm_delete: typer.confirm( - "WARNING: This deletes your current database and cannot be undone. \n" - "We highly recommend you make a copy of your database before doing this. \n" + "\nWARNING: This deletes your current database and cannot be undone. \n" + "We highly recommend you make a copy of your database before doing this. \n\n" "Do you still want to continue?", abort=True, ) @@ -48,11 +48,11 @@ def reset(confirm_delete: bool = False, use_prebuilt: bool = None): using_sqlite = DATABASES["default"]["ENGINE"] == "django.db.backends.sqlite3" if using_sqlite and use_prebuilt == None: use_prebuilt = typer.confirm( - "It looks like you are using the default database backend (sqlite3). 
\n" + "\nIt looks like you are using the default database backend (sqlite3). \n" "Would you like to use a prebuilt-database with all third-party data " "already loaded? \n" "If this is the first time you using the prebuild, this will " - "involve a ~1.5GB \ndownload and will unpack to roughly 5GB.\n" + "involve a ~1.5GB \ndownload and will unpack to roughly 5GB.\n\n" "We recommend answering 'yes' for beginners." ) diff --git a/src/simmate/database/third_parties/for_providers/materials_project.py b/src/simmate/database/third_parties/for_providers/materials_project.py index e2b925244..f33204689 100644 --- a/src/simmate/database/third_parties/for_providers/materials_project.py +++ b/src/simmate/database/third_parties/for_providers/materials_project.py @@ -13,7 +13,6 @@ """ from django.db import transaction -from pymatgen.ext.matproj import MPRester from rich.progress import track from simmate.database.third_parties import MatprojStructure @@ -22,7 +21,6 @@ @transaction.atomic def load_all_structures( api_key: str, - criteria: dict = {"task_id": {"$exists": True}}, update_stabilities: bool = False, ): """ @@ -44,46 +42,107 @@ def load_all_structures( will add over an hour to this process. Default is True. """ - # Notes on filtering criteria for structures in the Materials Project: - # - # Catagories such as 'elements' that we can filter off of are listed here: - # https://github.com/materialsproject/mapidoc - # Conditions such as $in or $exists that we filter based on are listed here: - # https://docs.mongodb.com/manual/reference/operator/query/ - # - # As the simplest example, I only want one structure and I grab it by it's - # mp-id in this code: - # criteria = {"task_id": mp_id} - # - # Here, we want all structures! 
Which is why we use: - # criteria = {"task_id": {"$exists": True}} - # - # This is an alternative criteria input that can be used for testing - # criteria={ - # "task_id": { - # "$exists": True, - # "$in": ["mp-" + str(n) for n in range(1, 10000)]}, - # } - # + try: + from mp_api.client import MPRester + except: + raise Exception( + "To use this method, MP-API is required. Please install it " + "with `pip install mp-api" + ) # Connect to their database with personal API key - mpr = MPRester(api_key) - - # For the filtered structures, this lists off which properties to grab. - # All possible properties are listed here: - # https://github.com/materialsproject/mapidoc - properties = [ - "material_id", - "final_energy", - "structure", - ] - - # now make the query and grab everything from the Materials Project! - # the output dictionary is given back within a list, where each entry is - # a specific structure (so a single mp-id) - # Note: this is a very large query, so make sure your computer has enough - # memory (RAM >10GB) and a stable internet connection. - data = mpr.query(criteria, properties) + with MPRester(api_key) as mpr: + + # For the filtered structures, this lists off which properties to grab. 
+ # All possible properties can be listed with: + # mpr.summary.available_fields + fields_to_load = [ + # "builder_meta", + # "nsites", + # "elements", + # "nelements", + # "composition", + # "composition_reduced", + # "formula_pretty", + # "formula_anonymous", + # "chemsys", + # "volume", + # "density", + # "density_atomic", + # "symmetry", + # "property_name", + "material_id", + # "deprecated", + # "deprecation_reasons", + "last_updated", + # "origins", + # "warnings", + "structure", + # "task_ids", + "uncorrected_energy_per_atom", + "energy_per_atom", + # "formation_energy_per_atom", + # "energy_above_hull", + # "is_stable", + # "equilibrium_reaction_energy_per_atom", + # "decomposes_to", + # "xas", + # "grain_boundaries", + "band_gap", + # "cbm", + # "vbm", + # "efermi", + "is_gap_direct", + # "is_metal", + # "es_source_calc_id", + # "bandstructure", + # "dos", + # "dos_energy_up", + # "dos_energy_down", + "is_magnetic", + # "ordering", + "total_magnetization", + # "total_magnetization_normalized_vol", + # "total_magnetization_normalized_formula_units", + # "num_magnetic_sites", + # "num_unique_magnetic_sites", + # "types_of_magnetic_species", + # "k_voigt", + # "k_reuss", + # "k_vrh", + # "g_voigt", + # "g_reuss", + # "g_vrh", + # "universal_anisotropy", + # "homogeneous_poisson", + # "e_total", + # "e_ionic", + # "e_electronic", + # "n", + # "e_ij_max", + # "weighted_surface_energy_EV_PER_ANG2", + # "weighted_surface_energy", + # "weighted_work_function", + # "surface_anisotropy", + # "shape_factor", + # "has_reconstructed", + # "possible_species", + # "has_props", + "theoretical", + ] + + # now make the query and grab everything from the Materials Project! + # the output dictionary is given back within a list, where each entry is + # a specific structure (so a single mp-id) + # Note: this is a very large query, so make sure your computer has enough + # memory (RAM >10GB) and a stable internet connection. 
+ data = mpr.summary.search( + all_fields=False, + fields=fields_to_load, + # !!! DEV NOTE: you can uncomment these lines for quick testing + # num_chunks=3, + # chunk_size=250, + ) # Let's iterate through each structure and save it to the database # This also takes a while, so we use a progress bar @@ -91,14 +150,38 @@ def load_all_structures( # convert the data to a Simmate database object structure_db = MatprojStructure.from_toolkit( - id=entry["material_id"], - structure=entry["structure"], - energy=entry["final_energy"], + id=str(entry.material_id), + structure=entry.structure, + energy=entry.energy_per_atom * entry.structure.num_sites, + energy_uncorrected=entry.uncorrected_energy_per_atom + * entry.structure.num_sites, + updated_at=fix_timezone(entry.last_updated), + band_gap=entry.band_gap, + is_gap_direct=entry.is_gap_direct, + is_magnetic=entry.is_magnetic, + total_magnetization=entry.total_magnetization, + is_theoretical=entry.theoretical, ) # and save it to our database! structure_db.save() - # once all structures are saved, let's update the Thermodynamic columns - if update_stabilities: - MatprojStructure.update_all_stabilities() + # # once all structures are saved, let's update the Thermodynamic columns + # if update_stabilities: + # MatprojStructure.update_all_stabilities() + + +# This fix function below is a copy/paste from... 
+# https://stackoverflow.com/questions/18622007/ + +from django.conf import settings +from django.utils.timezone import make_aware + + +def fix_timezone(naive_datetime): + + settings.TIME_ZONE # 'UTC' + aware_datetime = make_aware(naive_datetime) + aware_datetime.tzinfo # + + return aware_datetime diff --git a/src/simmate/database/third_parties/materials_project.py b/src/simmate/database/third_parties/materials_project.py index 7faea1383..edcb03d01 100644 --- a/src/simmate/database/third_parties/materials_project.py +++ b/src/simmate/database/third_parties/materials_project.py @@ -16,7 +16,18 @@ class MatprojStructure(Structure, Thermodynamics): class Meta: app_label = "third_parties" - base_info = ["id", "structure_string", "energy"] + base_info = [ + "id", + "structure_string", + "energy", + "energy_uncorrected", + "band_gap", + "is_gap_direct", + "is_magnetic", + "total_magnetization", + "is_theoretical", + "updated_at", + ] source = "Materials Project" source_doi = "https://doi.org/10.1063/1.4812323" remote_archive_link = "https://archives.simmate.org/MatProjStructure-2022-01-26.zip" @@ -26,6 +37,44 @@ class Meta: The id used to represent the structure (ex: "mp-12345") """ + energy_uncorrected = table_column.FloatField(blank=True, null=True) + """ + The reported energy of the system BEFORE Materials Project applies + their composition-based corrections. + """ + + band_gap = table_column.FloatField(blank=True, null=True) + """ + The band gap energy in eV. + """ + + is_gap_direct = table_column.BooleanField(blank=True, null=True) + """ + Whether the band gap is direct or indirect. + """ + + is_magnetic = table_column.BooleanField(blank=True, null=True) + """ + Whether the material is magnetic + """ + + total_magnetization = table_column.FloatField(blank=True, null=True) + """ + The total magnetization of the material + """ + + is_theoretical = table_column.BooleanField(blank=True, null=True) + """ + Whether the material is from a theoretical structure. 
False indicates + that it is experimentally known. + """ + + updated_at = table_column.DateTimeField(blank=True, null=True) + """ + Timestamp of when this row was last changed / updated by the + Materials Project + """ + @property def external_link(self) -> str: """ From ab2f7f711fdfd08f85175c509070ce33c9afa740 Mon Sep 17 00:00:00 2001 From: Jack Sundberg Date: Fri, 26 Aug 2022 19:15:23 -0400 Subject: [PATCH 2/3] checkpoint in mp --- .github/ISSUE_TEMPLATE/1-bug_report.yaml | 2 +- .github/ISSUE_TEMPLATE/2-feature_request.yaml | 13 +++----- .../for_providers/materials_project.py | 32 ++++++++++++++++--- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1-bug_report.yaml b/.github/ISSUE_TEMPLATE/1-bug_report.yaml index 9c195f9ca..ba414ca28 100644 --- a/.github/ISSUE_TEMPLATE/1-bug_report.yaml +++ b/.github/ISSUE_TEMPLATE/1-bug_report.yaml @@ -13,7 +13,7 @@ body: If you have a question about getting started or how a feature works, please report your question via our [Discussions Page](https://github.com/jacksund/simmate/discussions/categories/q-a). - If you indeed have a bug, it helps to include things like the structure you used, the code, & input files while completing this form. For the code, try not to add long scripts if the error is only happening in one small part of it. Instead, try to generate your issue/error with as little code as possible or even using a illustrative example. + If you have a bug, it helps to include things like the structure you used, the code, & input files while completing this form. For the code, try not to add long scripts if the error is only happening in one small part of it. Instead, try to generate your issue/error with as little code as possible or even using an illustrative example. 
- type: textarea attributes: diff --git a/.github/ISSUE_TEMPLATE/2-feature_request.yaml b/.github/ISSUE_TEMPLATE/2-feature_request.yaml index cbc5f87f1..17bd8eae2 100644 --- a/.github/ISSUE_TEMPLATE/2-feature_request.yaml +++ b/.github/ISSUE_TEMPLATE/2-feature_request.yaml @@ -13,18 +13,13 @@ body: - type: textarea attributes: - label: Current behavior and/or alternatives - description: A clear and concise description of any alternative solutions or features you have considered. How would you currently do this? - - - type: textarea - attributes: - label: Example Use - description: Provide an example or description of how this would help users. + label: Additional context + description: Provide additional context about the use case here. - type: textarea attributes: - label: Additional context - description: Provide additional context about the use case here. + label: To-do items + description: (for dev team) List of changes required to implement this - type: markdown attributes: diff --git a/src/simmate/database/third_parties/for_providers/materials_project.py b/src/simmate/database/third_parties/for_providers/materials_project.py index f33204689..1800bd7a3 100644 --- a/src/simmate/database/third_parties/for_providers/materials_project.py +++ b/src/simmate/database/third_parties/for_providers/materials_project.py @@ -136,13 +136,35 @@ def load_all_structures( # a specific structure (so a single mp-id) # Note: this is a very large query, so make sure your computer has enough # memory (RAM >10GB) and a stable internet connection. - data = mpr.summary.search( + # data = mpr.summary.search( + # all_fields=False, + # fields=fields_to_load, + # deprecated=False, + # # !!! DEV NOTE: you can uncomment these lines for quick testing + # # num_chunks=3, + # chunk_size=100, + # ) + + # BUG: The search above is super unstable, so instad, I grab all mp-id + # in one search, then make individual queries for the data of each + # after that. + # This takes about 30 minutes. 
+ mp_ids = mpr.summary.search( all_fields=False, - fields=fields_to_load, - # !!! DEV NOTE: you can uncomment these lines for quick testing - # num_chunks=3, - # chunk_size=250, + fields=["material_id"], + deprecated=False, + num_sites=(200, 1000), ) + data = [] + for entry in track(mp_ids): + result = mpr.summary.search( + material_ids=[entry.material_id], + all_fields=False, + fields=fields_to_load, + ) + data.append(result) + + # return data # Let's iterate through each structure and save it to the database # This also takes a while, so we use a progress bar From d9d2129955a9da8895978d51ca70c3cda807ab71 Mon Sep 17 00:00:00 2001 From: Jack Sundberg Date: Sat, 27 Aug 2022 11:33:05 -0400 Subject: [PATCH 3/3] switch to slower but stable queries --- CHANGELOG.md | 1 + .../for_providers/materials_project.py | 42 ++++++++++--------- .../third_parties/materials_project.py | 2 +- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6a003716..b8b1698ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ There is one key exception to the rules above -- and that is with `MAJOR`=0 rele - account for structures in `fixed-composition` having fewer nsites than input becuase of symmetry reduction during relaxation. Also, add `min_structures_exact` parameter to ensure we have at least N structures with the expected number of sites - add experimental `variable-composition` (variable refers to nsites, not stoichiometry) and `binary-composition` evolutionary searches - allow custom workflows to run from yaml +- update MatProj data to new api, and add several new columns for data (e.g. 
mag + band gap) **Refactors** - isolate optional dependencies so that our install is smaller diff --git a/src/simmate/database/third_parties/for_providers/materials_project.py b/src/simmate/database/third_parties/for_providers/materials_project.py index 1800bd7a3..08f9dd838 100644 --- a/src/simmate/database/third_parties/for_providers/materials_project.py +++ b/src/simmate/database/third_parties/for_providers/materials_project.py @@ -145,7 +145,7 @@ def load_all_structures( # chunk_size=100, # ) - # BUG: The search above is super unstable, so instad, I grab all mp-id + # BUG: The search above is super unstable, so instead, I grab all mp-id # in one search, then make individual queries for the data of each # after that. # This takes about 30 minutes. @@ -153,7 +153,6 @@ def load_all_structures( all_fields=False, fields=["material_id"], deprecated=False, - num_sites=(200, 1000), ) data = [] for entry in track(mp_ids): @@ -162,31 +161,36 @@ def load_all_structures( all_fields=False, fields=fields_to_load, ) - data.append(result) + data.append(result[0]) # return data # Let's iterate through each structure and save it to the database # This also takes a while, so we use a progress bar + failed_entries = [] for entry in track(data): + try: + # convert the data to a Simmate database object + structure_db = MatprojStructure.from_toolkit( + id=str(entry.material_id), + structure=entry.structure, + energy=entry.energy_per_atom * entry.structure.num_sites, + energy_uncorrected=entry.uncorrected_energy_per_atom + * entry.structure.num_sites, + updated_at=fix_timezone(entry.last_updated), + band_gap=entry.band_gap, + is_gap_direct=entry.is_gap_direct, + is_magnetic=entry.is_magnetic, + total_magnetization=entry.total_magnetization, + is_theoretical=entry.theoretical, + ) - # convert the data to a Simmate database object - structure_db = MatprojStructure.from_toolkit( - id=str(entry.material_id), - structure=entry.structure, - energy=entry.energy_per_atom * 
entry.structure.num_sites, - energy_uncorrected=entry.uncorrected_energy_per_atom - * entry.structure.num_sites, - updated_at=fix_timezone(entry.last_updated), - band_gap=entry.band_gap, - is_gap_direct=entry.is_gap_direct, - is_magnetic=entry.is_magnetic, - total_magnetization=entry.total_magnetization, - is_theoretical=entry.theoretical, - ) + # and save it to our database! + structure_db.save() + except: + failed_entries.append(entry) - # and save it to our database! - structure_db.save() + return failed_entries # # once all structures are saved, let's update the Thermodynamic columns # if update_stabilities: diff --git a/src/simmate/database/third_parties/materials_project.py b/src/simmate/database/third_parties/materials_project.py index edcb03d01..a6c8d0fbd 100644 --- a/src/simmate/database/third_parties/materials_project.py +++ b/src/simmate/database/third_parties/materials_project.py @@ -30,7 +30,7 @@ class Meta: ] source = "Materials Project" source_doi = "https://doi.org/10.1063/1.4812323" - remote_archive_link = "https://archives.simmate.org/MatProjStructure-2022-01-26.zip" + remote_archive_link = "https://archives.simmate.org/MatprojStructure-2022-08-27.zip" id = table_column.CharField(max_length=25, primary_key=True) """