7 changes: 6 additions & 1 deletion pyproject.toml
@@ -4,4 +4,9 @@ version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = []
dependencies = [
"boto3>=1.38.45",
"mp-pyrho>=0.4.5",
"pymatgen>=2025.6.14",
"ruff>=0.12.1",
]
118 changes: 118 additions & 0 deletions scripts/aws-open-data.ipynb
@@ -0,0 +1,118 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Accessing Data in LeMat-Rho AWS OpenData Repository\n",
"Data is stored in the following format:\n",
"/`immutable_ID`/`calc_type`/`VASP_file`.json.gz\n",
"\n",
"Here, the `immutable_ID` is similar to LeMat-Rho, in that it is the material ID respective of which database it came from, ie `mp-100` for Materials Project mp-100 material ID. The `calc_type` is the calculation type that was used to create the RAW vasp files (ie, `LeMatRhoStaticMaker` for the final static calculation). `VASP_file` is the vasp input/output file name such as `CHGCAR` or `AECCAR0` for example. It has been parsed using Pymatgen, saved as a json dictionary-like object and gzipped. "
]
},
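{
"cell_type": "markdown",
"metadata": {},
"source": [
"The bucket is publicly readable, so you can browse the files available for a given material with an anonymous (unsigned) boto3 client. The cell below is a minimal sketch, assuming the bucket also permits anonymous listing; the prefix `mp-1/` is just one example material ID."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"from botocore import UNSIGNED\n",
"from botocore.config import Config\n",
"\n",
"# Anonymous client, since the bucket allows unsigned public reads\n",
"s3 = boto3.client(\"s3\", region_name=\"us-west-2\", config=Config(signature_version=UNSIGNED))\n",
"\n",
"# List every object stored under one material ID\n",
"response = s3.list_objects_v2(Bucket=\"lemat-rho\", Prefix=\"mp-1/\")\n",
"for obj in response.get(\"Contents\", []):\n",
"    print(obj[\"Key\"])"
]
},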
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Downloading Data from AWS\n",
"\n",
"The following code below will download data from the AWS repository"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"from botocore import UNSIGNED\n",
"from botocore.config import Config\n",
"import gzip\n",
"import io\n",
"from pymatgen.io.vasp import Chgcar\n",
"import tempfile\n",
"\n",
"AWS_BUCKET_NAME = \"lemat-rho\"\n",
"FILE_PATH = 'mp-1/LeMatRhoStaticMaker/CHGCAR.gz'\n",
"\n",
"\n",
"def stream_gz_file_from_aws_bucket(s3_key, processor_cls):\n",
" s3 = boto3.client(\n",
" \"s3\", region_name=\"us-west-2\", config=Config(signature_version=UNSIGNED)\n",
" )\n",
" response = s3.get_object(Bucket=AWS_BUCKET_NAME, Key=s3_key)\n",
"\n",
" # Stream and decompress using GzipFile\n",
" gzipped_body = gzip.GzipFile(fileobj=response[\"Body\"])\n",
"\n",
" # If needed, wrap in BufferedReader to make it seekable\n",
" buffered_reader = io.BufferedReader(gzipped_body)\n",
"\n",
" # Pass to processor\n",
" processor = processor_cls(buffered_reader)\n",
" return processor.process()\n",
"\n",
"\n",
"class ChgCarProcessor:\n",
" def __init__(self, file_obj):\n",
" self.file_obj = file_obj\n",
"\n",
" def process(self):\n",
" with tempfile.TemporaryFile(mode=\"w+b\") as tf:\n",
" for chunk in iter(lambda: self.file_obj.read(1024 * 1024), b\"\"):\n",
" tf.write(chunk)\n",
" tf.seek(0)\n",
"\n",
" self.chgcar = Chgcar.from_file(\n",
" tf\n",
" )\n",
" return self.chgcar\n",
"\n",
"chgcar = stream_gz_file_from_aws_bucket(\n",
" s3_key=FILE_PATH, processor_cls=ChgCarProcessor)"
]
},
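{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the downloaded object. This is a sketch using standard pymatgen `Chgcar` attributes (`structure` and the `data[\"total\"]` grid):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The processor returns a pymatgen Chgcar; inspect its structure and grid shape\n",
"print(chgcar.structure.composition)\n",
"print(chgcar.data[\"total\"].shape)"
]
},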
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using PyRho from Materials Project to convert/process charge density files\n",
"In the database on HuggingFace, we use [pyRho](https://materialsproject.github.io/pyrho/) to process the raw charge densities and upscale them to be similar in sizes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyrho.charge_density import ChargeDensity\n",
"charge_density = ChargeDensity.from_pmg(chgcar)\n",
"upscaled_density = charge_density.pgrids['total'].lossy_smooth_compression([300,300,300])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
269 changes: 269 additions & 0 deletions scripts/process_aws_to_hf.py
@@ -0,0 +1,269 @@
"""
HF DataFrame Rows

| ID | LeMat-Bulk ID | BAWL Hash | Functional | Lattice Vectors |
| Species at Sites | Cartesian Site Positions | Normalized charge density |
| Normalized AECCAR0 | Normalized AECCAR1 | Normalized AECCAR2 |
| Bader Charge Partition | DDEC6 Charge Partition |
"""

import gzip
import io
import os
import subprocess
import sys

import boto3
import numpy as np
import pandas as pd
from datasets import Dataset
from monty.tempfile import ScratchDir

from pymatgen.command_line.bader_caller import BaderAnalysis
from pymatgen.command_line.chargemol_caller import ChargemolAnalysis
from pymatgen.core import Structure
from pymatgen.io.vasp import Chgcar, Vasprun
from pymatgen.io.vasp.sets import MatPESStaticSet

from pyrho.charge_density import ChargeDensity
from material_hasher.hasher.bawl import BAWLHasher


AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = "lemat-rho"

PERL_CHGCARSUM_FILE = "/Users/martinsiron/Downloads/vtstscripts-1034/chgsum.pl"
BADER_PATH = "/Users/martinsiron/Downloads/bader_osx" # bader executable

ATOMIC_DENSITIES_PATH = "/Users/martinsiron/Downloads/chargemol_09_26_2017/atomic_densities"


def pymatgen_to_optimade(pmg_structure: Structure):
    """Convert a pymatgen Structure into a dict of OPTIMADE-style structure fields."""
    data = {}
    # Sorted list rather than a set, so the column is deterministic and serializable
    data["elements"] = sorted(pmg_structure.chemical_system_set)
    data["nsites"] = len(pmg_structure)
    data["chemical_formula_anonymous"] = pmg_structure.composition.anonymized_formula
    data["chemical_formula_reduced"] = (
        pmg_structure.composition.reduced_composition.to_pretty_string()
    )
    data["chemical_formula_descriptive"] = pmg_structure.composition.to_pretty_string()
    data["nelements"] = len(pmg_structure.chemical_system_set)
    data["dimension_types"] = [1, 1, 1]
    data["nperiodic_dimensions"] = 3
    data["lattice_vectors"] = pmg_structure.lattice.matrix
    data["cartesian_site_positions"] = pmg_structure.cart_coords
    data["species_at_sites"] = [x.species.elements[0].name for x in pmg_structure]
    return data
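
# Minimal usage sketch for the converter above (hypothetical NaCl structure
# built inline, illustrative only; not part of the pipeline):
#
#   from pymatgen.core import Lattice
#   s = Structure(Lattice.cubic(5.69), ["Na", "Cl"], [[0, 0, 0], [0.5, 0.5, 0.5]])
#   pymatgen_to_optimade(s)["chemical_formula_reduced"]  # -> "NaCl"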


def push_dataframe_to_hf_dataset(
    df,
    repo_id: str,
    split: str = "train",
    private: bool = False,
    token: str | None = None,
):
    """Push a pandas DataFrame to a Hugging Face dataset repository.

    - Converts numpy arrays in cells (e.g., 3D grids) to nested Python lists.
    - Converts numpy scalar types to native Python types.
    - Creates a `datasets.Dataset` and pushes it to the Hub.
    """

    def to_serializable(value):
        if isinstance(value, np.ndarray):
            # Use float32 to reduce size; convert to nested lists
            if value.dtype != np.float32:
                value = value.astype(np.float32)
            return value.tolist()
        if isinstance(value, (np.floating,)):
            return float(value)
        if isinstance(value, (np.integer,)):
            return int(value)
        return value

    df_serializable = df.applymap(to_serializable)
    dataset = Dataset.from_pandas(df_serializable, preserve_index=False)
    dataset.push_to_hub(
        repo_id=repo_id,
        split=split,
        private=private,
        token=token or os.getenv("HF_TOKEN"),
    )
    return dataset


def stream_gz_file_from_aws_bucket(s3_key, processor_cls, **processor_kwargs):
    s3 = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    response = s3.get_object(Bucket=AWS_BUCKET_NAME, Key=s3_key)

    # Stream and decompress on the fly with GzipFile
    gzipped_body = gzip.GzipFile(fileobj=response["Body"])

    # Wrap in a BufferedReader for buffered, chunked reads of the stream
    buffered_reader = io.BufferedReader(gzipped_body)

    # Pass to processor
    processor = processor_cls(buffered_reader, **processor_kwargs)

    # Do not call processor.process() here; just return the processor
    return processor


# calc_type = 'LeMatRhoRelaxMaker_1'
def stream_gz_folder_from_aws_bucket(
    folder, files, calculation_type="LeMatRhoStaticMaker"
):
    s3 = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    for file in files:
        response = s3.get_object(
            Bucket=AWS_BUCKET_NAME, Key=os.path.join(folder, calculation_type, file)
        )
        gzipped_body = gzip.GzipFile(fileobj=response["Body"])
        buffered_reader = io.BufferedReader(gzipped_body)
        # Strip only the trailing ".gz" so "vasprun.xml.gz" is written as "vasprun.xml"
        with open(file.removesuffix(".gz"), "w") as fh:
            fh.write(buffered_reader.read().decode("utf-8"))


class CubeProcessor:
    def __init__(
        self,
        file_name,
        cube_class,
        key,
        pgrid_key="total",
        compression_shape=[200, 200, 200],
    ):
        self.cube_class = cube_class
        self.file_name = file_name
        self.pgrid_key = pgrid_key
        self.compression_shape = compression_shape
        self._hf_data = {}
        # Files in our bucket are gzipped CHGCAR-like text files
        self.cube_obj = cube_class.from_file(file_name)
        self.key = key

    def process(self):
        density = ChargeDensity.from_pmg(self.cube_obj)
        pgrid = density.pgrids[self.pgrid_key]
        self._hf_data.update(
            {
                f"{self.key}": pgrid.grid_data,
                "compressed_charge_density": (
                    pgrid.lossy_smooth_compression(self.compression_shape)
                ),
            }
        )

    @property
    def grid_3d(self):
        density = ChargeDensity.from_pmg(self.cube_obj)
        return density.pgrids[self.pgrid_key].grid_data


class ChgCarProcessor(CubeProcessor):
    def __init__(
        self,
        file_name,
        key="charge_density",
        cube_class=Chgcar,
        pgrid_key="total",
        compression_shape=[15, 15, 15],
    ):
        super().__init__(
            file_name,
            cube_class=cube_class,
            key=key,
            pgrid_key=pgrid_key,
            compression_shape=compression_shape,
        )


if __name__ == "__main__":
    bh = BAWLHasher()
    # get list of all folders in bucket using secret key
    material_ids = []
    s3 = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    response = s3.list_objects_v2(Bucket=AWS_BUCKET_NAME, Delimiter="/")
    if "CommonPrefixes" in response:
        for prefix in response["CommonPrefixes"]:
            if (
                "oqmd-" in prefix["Prefix"]
                or "mp-" in prefix["Prefix"]
                or "agm" in prefix["Prefix"]
            ):
                material_ids.append(prefix["Prefix"][:-1])

    data = []
    for material_id in material_ids[:1]:
        # with ScratchDir('.', ) as sd:
        print(f"processing {material_id}")
        row = {}
        row.update({"immutable_id": material_id})

        # Download files
        stream_gz_folder_from_aws_bucket(
            material_id, ["vasprun.xml.gz"], calculation_type="LeMatRhoRelaxMaker_1"
        )
        stream_gz_folder_from_aws_bucket(
            material_id,
            ["CHGCAR.gz", "AECCAR0.gz", "AECCAR1.gz", "AECCAR2.gz"],
        )

        # Process CHGCAR:
        chgcar = ChgCarProcessor("CHGCAR", cube_class=Chgcar)
        chgcar.process()
        row["bawl_hash"] = bh.get_material_hash(chgcar.cube_obj.structure)
        row.update(pymatgen_to_optimade(chgcar.cube_obj.structure))
        row["compressed_charge_density"] = chgcar.grid_3d

        aeccar0 = ChgCarProcessor("AECCAR0", cube_class=Chgcar)
        aeccar0.process()
        row["compressed_aeccar0_density"] = aeccar0.grid_3d

        aeccar1 = ChgCarProcessor("AECCAR1", cube_class=Chgcar)
        aeccar1.process()
        row["compressed_aeccar1_density"] = aeccar1.grid_3d

        aeccar2 = ChgCarProcessor("AECCAR2", cube_class=Chgcar)
        aeccar2.process()
        row["compressed_aeccar2_density"] = aeccar2.grid_3d

        ## Bader Charge Partitioning
        # Write a POTCAR matching the structure so Bader can assign charges
        MatPESStaticSet(Vasprun("vasprun.xml").structures[0]).potcar.write_file("POTCAR")
        # Sum AECCAR0 + AECCAR2 with the VTST chgsum.pl script to produce the
        # CHGCAR_sum reference density for Bader
        params = ["AECCAR0", "AECCAR2"]
        perl_script = subprocess.Popen(
            [PERL_CHGCARSUM_FILE, *params], stdout=sys.stdout
        )
        perl_script.communicate()
        ba = BaderAnalysis(
            os.path.join(os.getcwd(), "CHGCAR"),
            os.path.join(os.getcwd(), "POTCAR"),
            chgref_filename=os.path.join(os.getcwd(), "CHGCAR_sum"),
            bader_path=BADER_PATH,
        )
        row["bader_charges"] = ba.get_charge_decorated_structure().site_properties[
            "charge"
        ]
        row["bader_atomic_volume"] = ba.summary["atomic_volume"]

        data.append(row)

    # df = pd.DataFrame(data)
    # push_dataframe_to_hf_dataset(df, "lematerial/LeMat-Rho", private=True)