
Atomic data

data = Dataset.load('lots_of_data.xyz')     # create a Dataset from an XYZ file
train, valid = data.shuffle().split(0.9)    # shuffle structures and partition into train/valid sets
type(train)     # psiflow Dataset
type(valid)     # psiflow Dataset
The main difference between a psiflow Dataset instance and an actual Python list of Atoms is that a Dataset can represent data that will be generated in the future.
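For example, operations on a Dataset are scheduled immediately and return futures of their eventual result; a minimal sketch using only the calls shown above:

nstates = train.length()    # an AppFuture of the number of structures; returns immediately
print(nstates.result())     # .result() blocks until the underlying data actually exists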


Trainable potentials

config.num_features = 16                    # modify NequIP parameters to whatever
model = NequIPModel(config)                 # create model instance

# initialize, train, deploy
model.initialize(data_train)                # this will calculate the scale/shifts, and the average number of neighbors
model.train(data_train, data_valid)         # train using the supplied datasets

model.save('./')                            # saves initialized config and model to the current working directory!

# evaluate test error
data_test = Dataset.load('test.xyz')            # test data; contains QM reference energy/forces/stress
data_test_model = model.evaluate(data_test)     # same test data, but with predicted energy/forces/stress

errors = Dataset.get_errors(        # static method of Dataset to compute the error between two datasets
        data_test,
        data_test_model,
        properties=['forces'],      # only compute the force error
        elements=['C', 'O'],        # only include carbon or oxygen atoms
        metric='rmse',              # use RMSE instead of MAE or MAX
        ).result()                  # errors is an AppFuture; use .result() to get the actual values as an ndarray

Note that depending on how the psiflow execution is configured, it is perfectly possible that training, evaluation, and other operations are each executed on different compute resources; see the psiflow Configuration page for more information.

In many cases, it is recommended to provide these models with an estimate of the absolute energy of an isolated atom, for each element, at the specific level of theory and basis set considered. Instead of having the model learn the absolute total energy of the system, we first subtract these atomic energies in order to train the model on the formation energy of the system, as this generally improves the generalization performance of the model towards unseen stoichiometries.
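Concretely, for a structure containing n_e atoms of each element e, with isolated-atom energies E_e, the model is trained on
$$
E_{\text{form}} = E_{\text{total}} - \sum_e n_e E_e
$$
rather than on the total energy itself.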

model.add_atomic_energy('H', -13.7)     # add atomic energy of the isolated hydrogen atom
model.initialize(some_training_data)

model.add_atomic_energy('O', -400)      # raises an exception; the model is already initialized and must be reset first
model.reset()                           # removes the current model, but keeps the raw config
model.add_atomic_energy('O', -400)      # OK!
model.initialize(some_training_data)    # offsets the total energy with the given atomic energy values per atom
 
Whenever atomic energies are available, BaseModel instances will automatically offset the potential energy in a (labeled) Dataset by the sum of the energies of the isolated atoms; the underlying PyTorch network is then initialized and trained on the formation energy of the system instead. To avoid artificial energy discrepancies between models trained on the formation energy on the one hand, and reference potential energies as obtained from any BaseReference on the other, the evaluate method first performs the converse operation, i.e. it adds the energies of the isolated atoms back to the model's prediction of the formation energy.
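Schematically, this bookkeeping amounts to the following (a simplified sketch, not the actual BaseModel internals; atomic_energies is a hypothetical dict mapping chemical symbols to isolated-atom energies):

def to_formation_energy(atoms, total_energy, atomic_energies):
    # used during initialize/train: subtract the isolated-atom contributions
    offset = sum(atomic_energies[s] for s in atoms.get_chemical_symbols())
    return total_energy - offset

def to_total_energy(atoms, formation_energy, atomic_energies):
    # used during evaluate: add the isolated-atom contributions back
    offset = sum(atomic_energies[s] for s in atoms.get_chemical_symbols())
    return formation_energy + offset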

Molecular simulation

Having trained a model, it is possible to explore the phase space of a physical system in order to generate new geometries.


Temperature and pressure control are implemented by means of stochastic Langevin dynamics, because it typically dampens correlations more strongly than deterministic time propagation methods based on extended Lagrangians (e.g. Nose-Hoover). Propagation of a walker returns a metadata namedtuple with multiple fields, some of which are specific to the type of the walker.

import numpy as np
from psiflow.sampling import DynamicWalker


# walker parameters chosen to match the trajectory-length assertion below
walker = DynamicWalker(data_train[0], timestep=0.5, steps=1000, step=100, temperature=300)

# run short MD simulation using some model
metadata = walker.propagate(model=model)
The following fields are always present in the metadata object:

• metadata.state: AppFuture of a FlowAtoms object which represents the final state
• metadata.counter: AppFuture of an int representing the total number of steps that the walker has taken since its initialization (or most recent reset)
• metadata.reset: AppFuture of a bool which indicates whether the walker was reset during or after propagation (e.g. because the temperature diverged too far from its target value)

The dynamic walker in particular has a few additional fields which might be useful:

• metadata.temperature: AppFuture of a float representing the average temperature during the simulation
• metadata.stdout: filepath of the output log of the molecular dynamics run
• metadata.time: AppFuture of a float which represents the total elapsed time during propagation
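As a small illustration, the outcome of a propagation can be inspected through these fields (a sketch; the walker and model are those defined above):

metadata = walker.propagate(model=model)
print(metadata.state.result())          # FlowAtoms representing the final state
print(metadata.temperature.result())    # average temperature during the run
if metadata.reset.result():             # True if e.g. the temperature diverged
    print('walker was reset; inspect', metadata.stdout)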

When doing active learning, we're usually only interested in the final state of each of the walkers, and whether the average temperature remained within reasonable bounds. In that case, the returned metadata object contains all the necessary information about the propagation. However, the actual trajectory that the walker has followed can optionally be returned as a Dataset:

metadata, trajectory = walker.propagate(model=model, keep_trajectory=True)
assert trajectory.length().result() == (1000 / 100 + 1)    # includes the initial and final state
assert np.allclose(                                        # metadata contains the final state
        metadata.state.result().get_positions(),
        trajectory[-1].result().get_positions(),
        )

Parsl 102: Futures are necessary

This example should also illustrate why exactly we would represent data using Futures in the first place.


For example, if we would like to find out how many states were actually generated, we can use the dataset.length() function, which returns a Future of the length of the dataset:

length = trajectory.length()    # will execute before the trajectory is actually generated
length.result()                 # forces Python to wait until the MD calculation has finished, and then computes the actual length
 
See this page in the Parsl documentation for more information on asynchronous execution.


During phase space exploration, a walker may wander into unphysical regions whose structures are either impossible to evaluate with the given reference (e.g. due to SCF convergence issues) or do not contain any relevant information on the atomic interactions. While several techniques exist in the literature to check for such divergences, psiflow takes a pragmatic approach and simply monitors the temperature of the walkers. Statistical mechanics provides an exact expression for the distribution of the instantaneous temperature of the system as a function of the number of atoms N and the temperature of the heat bath T:
$$
3N\frac{T_i}{T} \sim \chi^2(3N)
$$
in which the chi-squared distribution arises because the temperature (i.e. the kinetic energy) is essentially equal to the sum of the squares of 3N normally distributed velocity components. Based on the inverse cumulative distribution function and a fixed p-value, we can derive a threshold temperature such that:
$$
P\left[T_i \leq T_{\text{thres}}\right] = 1 - p
$$
For example, for a system of 100 atoms in equilibrium at 300 K, we obtain a threshold temperature of about 360 K for p = 10⁻², and about 400 K for p = 10⁻⁴. If the final simulation temperature exceeds the threshold at the last step of the MD simulation (or model evaluation yielded a NaN or a ValueError at any point during the propagation), the walker resets its internal state to the starting configuration in order to make sure that subsequent propagations again start from a physically sensible structure.
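To make this concrete, the threshold temperature can be computed directly from the chi-squared quantile function; a small sketch using scipy (not part of psiflow's API):

from scipy.stats import chi2

def threshold_temperature(T, natoms, p):
    # temperature above which a propagation is considered to have diverged
    quantile = chi2.ppf(1 - p, 3 * natoms)     # (1 - p) quantile of chi2 with 3N degrees of freedom
    return T * quantile / (3 * natoms)

print(threshold_temperature(300, 100, 1e-2))   # ~360 K
print(threshold_temperature(300, 100, 1e-4))   # ~400 K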

In practical scenarios, phase space exploration is often performed in a massively parallel manner, using multiple walkers which differ in their initial configuration and their random number seed. Let us try and generate 10 walkers which are initialized with different snapshots from the trajectory obtained before:

walkers = DynamicWalker.multiply(
        10,
        data_start=trajectory,              # walker i is initialized to trajectory[i]
        temperature=300,
        steps=100,
        )
for i, walker in enumerate(walkers):
    assert walker.seed == i                 # unique seed for each walker

states = []                                 # keep track of 'Future' states
for walker in walkers:
    metadata = walker.propagate(model=model)    # proceeds in parallel!
    states.append(metadata.state)
data = Dataset(states)                      # put them in a Dataset
 
Geometry optimization is performed using the OptimizationWalker, which wraps around the preconditioned L-BFGS implementation in ASE; this is an efficient optimization algorithm which typically requires fewer steps than either conventional L-BFGS or first-order methods such as conjugate gradient (CG). Note that geometry optimizations in psiflow will generally not be able to reduce the total residual force on the system below about 0.01 eV/A, because of the relatively limited precision (float32) of model evaluation.
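Its usage mirrors that of the other walkers; the sketch below is hypothetical in its import path and keyword argument (the force-convergence threshold fmax is an assumption, not taken from the text above):

from psiflow.sampling import OptimizationWalker

walker = OptimizationWalker(
        data_train[0],
        fmax=0.05,      # hypothetical convergence threshold on the residual force, in eV/A
        )
metadata = walker.propagate(model=model)    # relaxed structure is available as metadata.state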

Bias potentials and enhanced sampling

In the vast majority of molecular dynamics simulations of realistic systems, it is beneficial to modify the equilibrium Boltzmann distribution with bias potentials or enhanced sampling techniques. To apply such a bias in a simulation, we employ the BiasedDynamicWalker; it is almost identical to the DynamicWalker, except that it accepts an additional (mandatory) bias keyword argument during initialization:
from psiflow.sampling import BiasedDynamicWalker, PlumedBias


plumed_input = """
UNITS LENGTH=A ENERGY=kj/mol TIME=fs
CV: VOLUME
METAD ARG=CV SIGMA=100 HEIGHT=2 PACE=10 LABEL=metad FILE=dummy
"""
bias = PlumedBias(plumed_input)        # a new hills file is generated

walker = BiasedDynamicWalker(data_train[0], bias=bias, timestep=0.5)    # initialize dynamic walker with bias
metadata = walker.propagate(model=model)                                # performs biased MD
 
Note that the bias instance will retain the hills that were generated during walker propagation. A bias can also be evaluated directly on an existing dataset, using its evaluate method. The returned object is a Parsl Future that represents an ndarray of shape (nstates, 2). The first column contains the value of the collective variable for each state, and the second column contains the bias energy.

values = bias.evaluate(data_train, variable='CV')       # compute the collective variable 'CV' and the bias energy

assert values.result().shape[0] == data_train.length().result()  # each snapshot is evaluated separately
assert values.result().shape[1] == 2                             # CV and bias per snapshot, in PLUMED units!
assert not np.allclose(values.result()[:, 1], 0)                 # bias energy from added hills
 
As another example, let's consider the same collective variable but now with a harmonic bias potential applied to it. Because sampling with and manipulation of harmonic bias potentials is ubiquitous in free energy calculations, psiflow provides specific functionalities for this particular case.
plumed_input = """
UNITS LENGTH=A ENERGY=kj/mol TIME=fs
CV: VOLUME
RESTRAINT ARG=CV AT=150 KAPPA=1 LABEL=restraint
"""
walker = BiasedDynamicWalker(data_train[0], bias=PlumedBias(plumed_input))  # walker with harmonic bias
state = walker.propagate(model=model).state

# change bias center and width
walker.bias.adjust_restraint(variable='CV', kappa=2, center=200)
state_ = walker.propagate(model=model).state

# if the system had enough time to equilibrate with the bias, then the following should hold
assert state.result().get_volume() < state_.result().get_volume()
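Because adjust_restraint can be called in between propagations, a simple umbrella-style scan over the restraint center takes only a few lines (a sketch built from the calls shown above; equilibration at each center is ignored):

states = []
for center in np.linspace(100, 300, 11):    # scan the restraint center
    walker.bias.adjust_restraint(variable='CV', kappa=2, center=center)
    states.append(walker.propagate(model=model).state)
windows = Dataset(states)                   # final state of each window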
 
Finally, psiflow also explicitly supports the use of numerical bias potentials as defined on a grid:
import numpy as np
from psiflow.sampling.bias import generate_external_grid

bias_function = lambda x: np.exp(-0.01 * (x - 150) ** 2)    # Gaussian hill at CV=150
grid_values   = np.linspace(0, 300, 500)                    # CV values for the numerical grid

grid = generate_external_grid(      # generate contents of the PLUMED grid file
        bias_function,
        grid_values,
        'CV',                       # use ARG=CV in the EXTERNAL action
        periodic=False,             # periodicity of the CV
        )
bias = PlumedBias(plumed_input, data={'EXTERNAL': grid})   # pass grid file as external dependency
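As with the metadynamics and restraint examples above, such a grid-based bias is then simply handed to a BiasedDynamicWalker (assuming plumed_input contains a corresponding EXTERNAL action):

walker = BiasedDynamicWalker(data_train[0], bias=bias)    # sample under the external grid bias
metadata = walker.propagate(model=model)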
 
Note that metadynamics hills files cannot be shared between walkers (as would be the case in multiple walker metadynamics), because this would introduce dependencies between otherwise independent simulations.

Level of theory

The evaluate method of a reference accepts either a Dataset or a single FlowAtoms instance, and performs the corresponding single-point calculations. Depending on which argument it receives, it returns either a future or a Dataset containing the QM energy, forces, and stress.

_, trajectory = walker.propagate(model=model, keep_trajectory=True)    # trajectory of states

labeled = reference.evaluate(trajectory)    # massively parallel evaluation (returns a new Dataset with results)
assert isinstance(labeled, Dataset)
print(labeled[0].result().info['energy'])   # CP2K potential energy!

labeled = reference.evaluate(trajectory[0])     # evaluates a single state (returns a FlowAtoms future)
assert isinstance(labeled, AppFuture)
assert isinstance(labeled.result(), FlowAtoms)
print(labeled.result().info['energy'])          # will print the same energy
 
The output and error logs that were generated during the actual evaluation are automatically stored in case they need to be checked for errors or unexpected behavior. Their location in the file system is tracked using additional attributes provided by the FlowAtoms class:

assert labeled.result().reference_status    # True, because the state was successfully evaluated
print(labeled.result().reference_stdout)    # e.g. ./psiflow_internal/000/task_logs/0000/cp2k_evaluate.stdout
print(labeled.result().reference_stderr)    # e.g. ./psiflow_internal/000/task_logs/0000/cp2k_evaluate.stderr
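For example, a failed singlepoint could be inspected as follows (a sketch using only the attributes shown above):

atoms = labeled.result()
if not atoms.reference_status:              # evaluation failed
    with open(atoms.reference_stderr, 'r') as f:
        print(f.read())                     # inspect the CP2K error log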
 

CP2K

The CP2KReference expects a traditional CP2K input file; it should only contain the FORCE_EVAL section. Additional input files which define the basis sets, pseudopotentials, and dispersion correction parameters have to be added to the calculator after initialization.
from psiflow.reference import CP2KReference


with open('cp2k_input.txt', 'r') as f:    # read the CP2K input file into a string
    cp2k_input = f.read()
reference = CP2KReference(cp2k_input)

# register additional input files with the following mapping
# if the corresponding keyword in the CP2K input file is X, use Y as key here:
# X: BASIS_SET_FILE_NAME    ->   Y: basis_set
# X: POTENTIAL_FILE_NAME    ->   Y: potential
# X: PARAMETER_FILE_NAME    ->   Y: dftd3
reference.add_file('basis_set', 'BASIS_MOLOPT_UZH')
reference.add_file('potential', 'POTENTIAL_UZH')
reference.add_file('dftd3', 'dftd3.dat')
 


Learning algorithms

A learning algorithm ties all of this together: it iteratively updates the knowledge in the model with the states that were sampled by the walkers and evaluated with the chosen reference level of theory. Take a look at the following example:
from psiflow.learning import SequentialLearning


data_train = Dataset.load('initial_train.xyz')
data_valid = Dataset.load('initial_valid.xyz')

walkers = DynamicWalker.multiply(     # initializes 30 walkers, with different initial configuration and seed
        30,
        data_train,                   # Dataset which provides initial configurations
        timestep=0.5,
        steps=400,
        step=50,
        start=0,
        temperature=600,
        pressure=0,                   # NPT
        force_threshold=30,
        initial_temperature=600,
        )

learning = SequentialLearning(              # implements sequential learning
        path_output=path_output,            # folder in which consecutive models and data should be saved
        niterations=10,                     # number of (generate, train) iterations
        train_from_scratch=True,            # whether to train with reinitialized weights in each iteration
        train_valid_split=0.9,              # partitioning of generated states into training and validation
        )

data_train, data_valid = learning.run(
        model=model,                        # initial model
        reference=reference,                # reference level of theory
        walkers=walkers,                    # list of walkers
        )

model.save(path_output)                 # save new model separately
data_train.save('final_train.xyz')      # save final training data
data_valid.save('final_valid.xyz')      # save final validation data
 
The learning.run() method implements the actual online learning algorithm; in this case, it will repeat the (generate, train) iteration ten times, as configured above.
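Schematically, each iteration boils down to something like the following (a simplified sketch built from the calls introduced earlier, not the actual psiflow implementation; accumulation of data across iterations is omitted):

for i in range(10):                                 # niterations
    # generate: sample new states with the current model
    states = [walker.propagate(model=model).state for walker in walkers]
    # label: evaluate the sampled states at the reference level of theory
    labeled = reference.evaluate(Dataset(states))
    # partition into training and validation (train_valid_split)
    train, valid = labeled.shuffle().split(0.9)
    # train from scratch, with reinitialized weights (train_from_scratch)
    model.reset()
    model.initialize(train)
    model.train(train, valid)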
