7 changes: 6 additions & 1 deletion pyproject.toml
@@ -4,4 +4,9 @@ version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = []
dependencies = [
"boto3>=1.38.45",
"mp-pyrho>=0.4.5",
"pymatgen>=2025.6.14",
"ruff>=0.12.1",
]
118 changes: 118 additions & 0 deletions scripts/aws-open-data.ipynb
@@ -0,0 +1,118 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Accessing Data in LeMat-Rho AWS OpenData Repository\n",
"Data is stored in the following format:\n",
"/`immutable_ID`/`calc_type`/`VASP_file`.json.gz\n",
"\n",
"Here, the `immutable_ID` is similar to LeMat-Rho, in that it is the material ID respective of which database it came from, ie `mp-100` for Materials Project mp-100 material ID. The `calc_type` is the calculation type that was used to create the RAW vasp files (ie, `LeMatRhoStaticMaker` for the final static calculation). `VASP_file` is the vasp input/output file name such as `CHGCAR` or `AECCAR0` for example. It has been parsed using Pymatgen, saved as a json dictionary-like object and gzipped. "
]
},
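{
"cell_type": "markdown",
"metadata": {},
"source": [
"The bucket is publicly readable, so you can browse the files available for a given material with an anonymous (unsigned) boto3 client. The cell below is a minimal sketch, assuming the bucket also permits anonymous listing; the prefix `mp-1/` is just one example material ID."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"from botocore import UNSIGNED\n",
"from botocore.config import Config\n",
"\n",
"# Anonymous client, since the bucket allows unsigned public reads\n",
"s3 = boto3.client(\"s3\", region_name=\"us-west-2\", config=Config(signature_version=UNSIGNED))\n",
"\n",
"# List every object stored under one material ID\n",
"response = s3.list_objects_v2(Bucket=\"lemat-rho\", Prefix=\"mp-1/\")\n",
"for obj in response.get(\"Contents\", []):\n",
"    print(obj[\"Key\"])"
]
},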
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Downloading Data from AWS\n",
"\n",
"The following code below will download data from the AWS repository"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"from botocore import UNSIGNED\n",
"from botocore.config import Config\n",
"import gzip\n",
"import io\n",
"from pymatgen.io.vasp import Chgcar\n",
"import tempfile\n",
"\n",
"AWS_BUCKET_NAME = \"lemat-rho\"\n",
"FILE_PATH = 'mp-1/LeMatRhoStaticMaker/CHGCAR.gz'\n",
"\n",
"\n",
"def stream_gz_file_from_aws_bucket(s3_key, processor_cls):\n",
" s3 = boto3.client(\n",
" \"s3\", region_name=\"us-west-2\", config=Config(signature_version=UNSIGNED)\n",
" )\n",
" response = s3.get_object(Bucket=AWS_BUCKET_NAME, Key=s3_key)\n",
"\n",
" # Stream and decompress using GzipFile\n",
" gzipped_body = gzip.GzipFile(fileobj=response[\"Body\"])\n",
"\n",
" # If needed, wrap in BufferedReader to make it seekable\n",
" buffered_reader = io.BufferedReader(gzipped_body)\n",
"\n",
" # Pass to processor\n",
" processor = processor_cls(buffered_reader)\n",
" return processor.process()\n",
"\n",
"\n",
"class ChgCarProcessor:\n",
" def __init__(self, file_obj):\n",
" self.file_obj = file_obj\n",
"\n",
" def process(self):\n",
" with tempfile.TemporaryFile(mode=\"w+b\") as tf:\n",
" for chunk in iter(lambda: self.file_obj.read(1024 * 1024), b\"\"):\n",
" tf.write(chunk)\n",
" tf.seek(0)\n",
"\n",
" self.chgcar = Chgcar.from_file(\n",
" tf\n",
" )\n",
" return self.chgcar\n",
"\n",
"chgcar = stream_gz_file_from_aws_bucket(\n",
" s3_key=FILE_PATH, processor_cls=ChgCarProcessor)"
]
},
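{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the downloaded object. This is a sketch using standard pymatgen `Chgcar` attributes (`structure` and the `data[\"total\"]` grid):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The processor returns a pymatgen Chgcar; inspect its structure and grid shape\n",
"print(chgcar.structure.composition)\n",
"print(chgcar.data[\"total\"].shape)"
]
},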
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using PyRho from Materials Project to convert/process charge density files\n",
"In the database on HuggingFace, we use [pyRho](https://materialsproject.github.io/pyrho/) to process the raw charge densities and upscale them to be similar in sizes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyrho.charge_density import ChargeDensity\n",
"charge_density = ChargeDensity.from_pmg(chgcar)\n",
"upscaled_density = charge_density.pgrids['total'].lossy_smooth_compression([300,300,300])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
269 changes: 269 additions & 0 deletions scripts/process_aws_to_hf.py
@@ -0,0 +1,269 @@
"""
HF DataFrame Rows

| ID | LeMat-Bulk ID | BAWL Hash | Functional | Lattice Vectors |
| Species at Sites | Cartesian Site Positions | Normalized charge density |
| Normalized AECCAR0 | Normalized AECCAR1 | Normalized AECCAR2 |
| Bader Charge Partition | DDEC6 Charge Partition |
"""

import gzip
import io
import os
import subprocess
import sys

import boto3
import numpy as np
import pandas as pd
from datasets import Dataset
from monty.tempfile import ScratchDir

from pymatgen.command_line.bader_caller import BaderAnalysis
from pymatgen.command_line.chargemol_caller import ChargemolAnalysis
from pymatgen.core import Structure
from pymatgen.io.vasp import Chgcar, Vasprun
from pymatgen.io.vasp.sets import MatPESStaticSet

from pyrho.charge_density import ChargeDensity
from material_hasher.hasher.bawl import BAWLHasher


AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = "lemat-rho"

PERL_CHGCARSUM_FILE = "/Users/martinsiron/Downloads/vtstscripts-1034/chgsum.pl"
BADER_PATH = "/Users/martinsiron/Downloads/bader_osx" # bader executable

ATOMIC_DENSITIES_PATH = "/Users/martinsiron/Downloads/chargemol_09_26_2017/atomic_densities"


def pymatgen_to_optimade(pmg_structure: Structure):
    """Convert a pymatgen Structure into a dict of OPTIMADE-style structure fields."""
    data = {}
    # Sorted list rather than a set, so the column is deterministic and serializable
    data["elements"] = sorted(pmg_structure.chemical_system_set)
    data["nsites"] = len(pmg_structure)
    data["chemical_formula_anonymous"] = pmg_structure.composition.anonymized_formula
    data["chemical_formula_reduced"] = (
        pmg_structure.composition.reduced_composition.to_pretty_string()
    )
    data["chemical_formula_descriptive"] = pmg_structure.composition.to_pretty_string()
    data["nelements"] = len(pmg_structure.chemical_system_set)
    data["dimension_types"] = [1, 1, 1]
    data["nperiodic_dimensions"] = 3
    data["lattice_vectors"] = pmg_structure.lattice.matrix
    data["cartesian_site_positions"] = pmg_structure.cart_coords
    data["species_at_sites"] = [x.species.elements[0].name for x in pmg_structure]
    return data
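
# Minimal usage sketch for the converter above (hypothetical NaCl structure
# built inline, illustrative only; not part of the pipeline):
#
#   from pymatgen.core import Lattice
#   s = Structure(Lattice.cubic(5.69), ["Na", "Cl"], [[0, 0, 0], [0.5, 0.5, 0.5]])
#   pymatgen_to_optimade(s)["chemical_formula_reduced"]  # -> "NaCl"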


def push_dataframe_to_hf_dataset(
    df,
    repo_id: str,
    split: str = "train",
    private: bool = False,
    token: str | None = None,
):
    """Push a pandas DataFrame to a Hugging Face dataset repository.

    - Converts numpy arrays in cells (e.g., 3D grids) to nested Python lists.
    - Converts numpy scalar types to native Python types.
    - Creates a `datasets.Dataset` and pushes it to the Hub.
    """

    def to_serializable(value):
        if isinstance(value, np.ndarray):
            # Use float32 to reduce size; convert to nested lists
            if value.dtype != np.float32:
                value = value.astype(np.float32)
            return value.tolist()
        if isinstance(value, (np.floating,)):
            return float(value)
        if isinstance(value, (np.integer,)):
            return int(value)
        return value

    df_serializable = df.applymap(to_serializable)
    dataset = Dataset.from_pandas(df_serializable, preserve_index=False)
    dataset.push_to_hub(
        repo_id=repo_id,
        split=split,
        private=private,
        token=token or os.getenv("HF_TOKEN"),
    )
    return dataset


def stream_gz_file_from_aws_bucket(s3_key, processor_cls, **processor_kwargs):
    s3 = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    response = s3.get_object(Bucket=AWS_BUCKET_NAME, Key=s3_key)

    # Stream and decompress on the fly with GzipFile
    gzipped_body = gzip.GzipFile(fileobj=response["Body"])

    # Wrap in a BufferedReader for buffered, chunked reads of the stream
    buffered_reader = io.BufferedReader(gzipped_body)

    # Pass to processor
    processor = processor_cls(buffered_reader, **processor_kwargs)

    # Do not call processor.process() here; just return the processor
    return processor


# calc_type = 'LeMatRhoRelaxMaker_1'
def stream_gz_folder_from_aws_bucket(
    folder, files, calculation_type="LeMatRhoStaticMaker"
):
    s3 = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    for file in files:
        response = s3.get_object(
            Bucket=AWS_BUCKET_NAME, Key=os.path.join(folder, calculation_type, file)
        )
        gzipped_body = gzip.GzipFile(fileobj=response["Body"])
        buffered_reader = io.BufferedReader(gzipped_body)
        # Strip only the trailing ".gz" so "vasprun.xml.gz" is written as "vasprun.xml"
        with open(file.removesuffix(".gz"), "w") as fh:
            fh.write(buffered_reader.read().decode("utf-8"))


class CubeProcessor:
    def __init__(
        self,
        file_name,
        cube_class,
        key,
        pgrid_key="total",
        compression_shape=[200, 200, 200],
    ):
        self.cube_class = cube_class
        self.file_name = file_name
        self.pgrid_key = pgrid_key
        self.compression_shape = compression_shape
        self._hf_data = {}
        # Files in our bucket are gzipped CHGCAR-like text files
        self.cube_obj = cube_class.from_file(file_name)
        self.key = key

    def process(self):
        density = ChargeDensity.from_pmg(self.cube_obj)
        pgrid = density.pgrids[self.pgrid_key]
        self._hf_data.update(
            {
                f"{self.key}": pgrid.grid_data,
                "compressed_charge_density": (
                    pgrid.lossy_smooth_compression(self.compression_shape)
                ),
            }
        )

    @property
    def grid_3d(self):
        density = ChargeDensity.from_pmg(self.cube_obj)
        return density.pgrids[self.pgrid_key].grid_data


class ChgCarProcessor(CubeProcessor):
    def __init__(
        self,
        file_name,
        key="charge_density",
        cube_class=Chgcar,
        pgrid_key="total",
        compression_shape=[15, 15, 15],
    ):
        super().__init__(
            file_name,
            cube_class=cube_class,
            key=key,
            pgrid_key=pgrid_key,
            compression_shape=compression_shape,
        )


if __name__ == "__main__":
    bh = BAWLHasher()
    # get list of all folders in bucket using secret key
    material_ids = []
    s3 = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    response = s3.list_objects_v2(Bucket=AWS_BUCKET_NAME, Delimiter="/")
    if "CommonPrefixes" in response:
        for prefix in response["CommonPrefixes"]:
            if (
                "oqmd-" in prefix["Prefix"]
                or "mp-" in prefix["Prefix"]
                or "agm" in prefix["Prefix"]
            ):
                material_ids.append(prefix["Prefix"][:-1])

    data = []
    for material_id in material_ids[:1]:
        # with ScratchDir('.', ) as sd:
        print(f"processing {material_id}")
        row = {}
        row.update({"immutable_id": material_id})

        # Download files
        stream_gz_folder_from_aws_bucket(
            material_id, ["vasprun.xml.gz"], calculation_type="LeMatRhoRelaxMaker_1"
        )
        stream_gz_folder_from_aws_bucket(
            material_id,
            ["CHGCAR.gz", "AECCAR0.gz", "AECCAR1.gz", "AECCAR2.gz"],
        )

        # Process CHGCAR:
        chgcar = ChgCarProcessor("CHGCAR", cube_class=Chgcar)
        chgcar.process()
        row["bawl_hash"] = bh.get_material_hash(chgcar.cube_obj.structure)
        row.update(pymatgen_to_optimade(chgcar.cube_obj.structure))
        row["compressed_charge_density"] = chgcar.grid_3d

        aeccar0 = ChgCarProcessor("AECCAR0", cube_class=Chgcar)
        aeccar0.process()
        row["compressed_aeccar0_density"] = aeccar0.grid_3d

        aeccar1 = ChgCarProcessor("AECCAR1", cube_class=Chgcar)
        aeccar1.process()
        row["compressed_aeccar1_density"] = aeccar1.grid_3d

        aeccar2 = ChgCarProcessor("AECCAR2", cube_class=Chgcar)
        aeccar2.process()
        row["compressed_aeccar2_density"] = aeccar2.grid_3d

        ## Bader Charge Partitioning
        # Write a POTCAR matching the structure so Bader can assign charges
        MatPESStaticSet(Vasprun("vasprun.xml").structures[0]).potcar.write_file("POTCAR")
        # Sum AECCAR0 + AECCAR2 with the VTST chgsum.pl script to produce the
        # CHGCAR_sum reference density for Bader
        params = ["AECCAR0", "AECCAR2"]
        perl_script = subprocess.Popen(
            [PERL_CHGCARSUM_FILE, *params], stdout=sys.stdout
        )
        perl_script.communicate()
        ba = BaderAnalysis(
            os.path.join(os.getcwd(), "CHGCAR"),
            os.path.join(os.getcwd(), "POTCAR"),
            chgref_filename=os.path.join(os.getcwd(), "CHGCAR_sum"),
            bader_path=BADER_PATH,
        )
        row["bader_charges"] = ba.get_charge_decorated_structure().site_properties[
            "charge"
        ]
        row["bader_atomic_volume"] = ba.summary["atomic_volume"]

        data.append(row)

    # df = pd.DataFrame(data)
    # push_dataframe_to_hf_dataset(df, "lematerial/LeMat-Rho", private=True)