# Copyright (c) [2024-2025] [Grogupy Team]
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING, Union
import numpy as np
from numpy.typing import NDArray
from grogupy._tqdm import _tqdm
from grogupy.config import CONFIG
from grogupy.physics.utilities import interaction_energy
from .utilities import calc_Vu, onsite_projection, tau_u
if TYPE_CHECKING:
from grogupy.physics.builder import Builder
if CONFIG.is_GPU:
# initialize parallel GPU stuff
import cupy as cp
from cupy.typing import NDArray as CNDArray
# Disable memory pool for device memory (GPU)
cp.cuda.set_allocator(None)
# Disable memory pool for pinned memory (CPU).
cp.cuda.set_pinned_memory_allocator(None)
def gpu_solver(
max_g_per_loop: int,
mode: str,
gpu_number: int,
kpoints: list[NDArray],
kweights: list[NDArray],
SBI: NDArray,
SBI1: NDArray,
SBI2: NDArray,
Ruc: NDArray,
sc_off: NDArray,
samples: NDArray,
G_mag: NDArray,
G_pair_ij: NDArray,
G_pair_ji: NDArray,
rotated_H: NDArray,
S: NDArray,
rot_num: Union[int, str] = "Unknown",
) -> tuple["CNDArray", "CNDArray", "CNDArray"]:
"""Parallelizes the Green's function solution on GPU.
Should be used on large systems.
Parameters
----------
max_g_per_loop: int
Maximum number of greens function samples per loop
mode : str
The Greens function solver, which can be parallel or sequential
gpu_number : int
The ID of the GPU which we want to run on
kpoints : list[NDArray]
The kpoints already split for the GPUs
kweights : list[NDArray]
The kpoint weights already split for the GPUs
SBI : list[NDArray]
Spin box indices for the magnetic entities
SBI1 : list[NDArray]
Spin box indices for the pairs
SBI2 : list[NDArray]
Spin box indices for the pairs
Ruc : list[NDArray]
Unit cell shift of the pairs
sc_off : NDArray
List of unit cell shifts for unit cell indexes
samples : NDArray
Energy samples
G_mag : list[NDArray]
Empty container for the final Green's function on each magnetic entity
G_pair_ij : list[NDArray]
Empty container for the final Green's function on each pair
G_pair_ji : list[NDArray]
Empty container for the final Green's function on each pair
rotated_H : list[NDArray]
Hamiltonian with rotated exchange field
S : NDArray
Overlap matrix, should be the same for all Hamiltonians
rot_num: str, optional
Rotation number for tqdm print, by default "Unknown"
Returns
-------
local_G_mag : CNDArray
The Greens function of the mangetic entities
local_G_pair_ij : CNDArray
The Greens function from mangetic entity i to j on the given GPU
local_G_pair_ji : CNDArray
The Greens function from mangetic entity j to i on the given GPU
"""
# use the specified GPU
with cp.cuda.Device(gpu_number):
# copy everything to GPU
kpoints = cp.array(kpoints[gpu_number])
kweights = cp.array(kweights[gpu_number])
SBI = cp.array(SBI)
SBI1 = cp.array(SBI1)
SBI2 = cp.array(SBI2)
supercell_shift = cp.array(Ruc)
sc_off = cp.array(sc_off)
eset = samples.shape[0]
samples = cp.array(samples.reshape(eset, 1, 1))
G_mag = np.zeros_like(G_mag)
G_pair_ij = np.zeros_like(G_pair_ij)
G_pair_ji = np.zeros_like(G_pair_ji)
for i in _tqdm(
range(len(kpoints)),
desc=f"Rotation {rot_num}, parallel over k on GPU{gpu_number+1}",
):
# weight of k point in BZ integral
wk = kweights[i]
k = kpoints[i]
# calculate Hamiltonian and Overlap matrix in a given k point
# this generates the list of phases
phases = cp.exp(-1j * 2 * cp.pi * k @ sc_off.T)
# phases applied to the hamiltonian
HK = cp.einsum("abc,a->bc", cp.array(rotated_H), phases)
SK = cp.einsum("abc,a->bc", cp.array(S), phases)
# solve the Greens function on all energy points in one step
if mode[0].lower() == "p":
Gk = cp.linalg.inv(SK * samples - HK)
# store the Greens function slice of the magnetic entities
for l, sbi in enumerate(SBI):
G_mag[l] += onsite_projection(Gk, sbi, sbi).get() * wk
# store the Greens function slice of the pairs
for l, dat in enumerate(zip(SBI1, SBI2, supercell_shift)):
sbi1, sbi2, ruc = dat
phase = cp.exp(1j * 2 * cp.pi * k @ ruc.T)
G_pair_ij[l] += (
onsite_projection(Gk, sbi1, sbi2).get() * wk * phase
)
G_pair_ji[l] += (
onsite_projection(Gk, sbi2, sbi1).get() * wk / phase
)
# solve Greens function sequentially for the energies, because of memory bound
elif mode[0].lower() == "s":
# make chunks for reduced parallelization over energy sample points
number_of_chunks = np.floor(eset / max_g_per_loop) + 1
# constrain to sensible size
if number_of_chunks > eset:
number_of_chunks = eset
# create batches using slices on every instance
slices = np.array_split(range(eset), number_of_chunks)
# fills the holders sequentially by the Greens function slices on
# a given energy
for slice in slices:
Gk = cp.linalg.inv(SK * samples[slice] - HK)
# store the Greens function slice of the magnetic entities
for l, sbi in enumerate(SBI):
G_mag[l][slice] += (
onsite_projection(Gk, sbi, sbi).get() * wk
)
# store the Greens function slice of the pairs
for l, dat in enumerate(zip(SBI1, SBI2, Ruc)):
sbi1, sbi2, ruc = dat
phase = cp.exp(1j * 2 * cp.pi * k @ ruc.T)
G_pair_ij[l][slice] += (
onsite_projection(Gk, sbi1, sbi2).get() * wk * phase
)
G_pair_ji[l][slice] += (
onsite_projection(Gk, sbi2, sbi1).get() * wk / phase
)
return G_mag, G_pair_ij, G_pair_ji
def solve_parallel_over_k(builder: "Builder", print_memory: bool = False) -> None:
"""It calculates the energies by the Greens function method.
It inverts the Hamiltonians of all directions set up in the given
k-points at the given energy levels. The solution is parallelized over
k-points. It uses the number of GPUs given. And determines the parallelization
over energy levels from the ``builder.greens_function_solver`` attribute.
Parameters
----------
builder : Builder
The system that we want to solve
print_memory: bool, optional
It can be turned on to print extra memory info, by default False
"""
parallel_size = CONFIG.parallel_size
# checks for setup
if builder.kspace is None:
raise Exception("Kspace is not defined!")
if builder.contour is None:
raise Exception("Kspace is not defined!")
if builder.hamiltonian is None:
raise Exception("Kspace is not defined!")
# reset hamiltonians, magnetic entities and pairs
builder._rotated_hamiltonians = []
for mag_ent in builder.magnetic_entities:
mag_ent.reset()
for pair in builder.pairs:
pair.reset()
# calculate and print memory stuff
# 16 is the size of complex numbers in byte, when using np.float64
H_mem = np.sum(
[
np.prod(builder.hamiltonian.H.shape) * 16,
np.prod(builder.hamiltonian.S.shape) * 16,
]
)
mag_ent_mem = (
builder.contour.eset * builder.magnetic_entities.SBS**2
).sum() * 16
pair_mem = (
builder.contour.eset * builder.pairs.SBS1 * builder.pairs.SBS2
).sum() * 16
if print_memory:
print("\n\n\n")
print(
"################################################################################"
)
print(
"################################################################################"
)
print(f"Memory allocated by rotated Hamilonian: {H_mem/1e6} MB")
print(f"Memory allocated by magnetic entities: {mag_ent_mem/1e6} MB")
print(f"Memory allocated by pairs: {pair_mem/1e6} MB")
print(
f"Total memory allocated in RAM: {(H_mem+mag_ent_mem+pair_mem) / 1e6} MB"
)
print(
"--------------------------------------------------------------------------------"
)
if builder.greens_function_solver[0].lower() == "p": # parallel solver
G_mem = builder.contour.eset * np.prod(builder.hamiltonian.H.shape) * 16
elif builder.greens_function_solver[0].lower() == "s": # sequentia solver
G_mem = (
builder.max_g_per_loop * np.prod(builder.hamiltonian.H.shape) * 16
)
else:
raise Exception("Unknown Greens function solver!")
print("Memory allocated on GPU:")
print(f"Memory allocated by rotated Hamilonian: {H_mem/1e6} MB")
print(f"Memory allocated for Greens function samples: {G_mem/1e6} MB")
# 25 is the maximum amount of memory used for matrix inversion
gpu_max_mem = np.max([mag_ent_mem + pair_mem + H_mem, G_mem * 25]) / 1e6
print(f"Total peak memory on GPU during solution: {gpu_max_mem} MB")
print(
"################################################################################"
)
print(
"################################################################################"
)
print("\n\n\n")
# iterate over the reference directions (quantization axes)
for i, orient in enumerate(builder.ref_xcf_orientations):
# obtain rotated Hamiltonian
if builder.low_memory_mode:
rot_H = builder.hamiltonian
else:
rot_H = builder.hamiltonian.copy()
if not np.allclose(rot_H.orientation, orient["o"]):
rot_H.rotate(orient["o"])
# setup empty Greens function holders for integration and
# initialize rotation storage
for mag_ent in builder.magnetic_entities:
mag_ent._Vu1_tmp = []
mag_ent._Vu2_tmp = []
mag_ent._Gii_tmp = np.zeros(
(builder.contour.eset, mag_ent.SBS, mag_ent.SBS),
dtype="complex128",
)
for pair in builder.pairs:
pair._Gij_tmp = np.zeros(
(builder.contour.eset, pair.SBS1, pair.SBS2), dtype="complex128"
)
pair._Gji_tmp = np.zeros(
(builder.contour.eset, pair.SBS2, pair.SBS1), dtype="complex128"
)
# split k points to parallelize
# (this could be outside loop, but it was an easy fix for the
# reset of tqdm in each reference direction)
parallel_k = np.array_split(builder.kspace.kpoints, parallel_size)
parallel_w = np.array_split(builder.kspace.weights, parallel_size)
with ThreadPoolExecutor(max_workers=parallel_size) as executor:
futures = [
executor.submit(
gpu_solver,
builder.max_g_per_loop,
builder.greens_function_solver,
gpu_number,
parallel_k,
parallel_w,
builder.magnetic_entities.SBI,
builder.pairs.SBI1,
builder.pairs.SBI2,
builder.pairs.supercell_shift,
builder.hamiltonian.sc_off,
builder.contour.samples,
builder.magnetic_entities._Gii_tmp,
builder.pairs._Gij_tmp,
builder.pairs._Gji_tmp,
builder.hamiltonian.H,
builder.hamiltonian.S,
i + 1,
)
for gpu_number in range(parallel_size)
]
results = [future.result() for future in futures]
# Combine results
G_mag_reduce = np.zeros_like(builder.magnetic_entities._Gii_tmp)
G_pair_ij_reduce = np.zeros_like(builder.pairs._Gij_tmp)
G_pair_ji_reduce = np.zeros_like(builder.pairs._Gji_tmp)
for G_mag_local, G_pair_ij_local, G_pair_ji_local in results:
G_mag_reduce += G_mag_local
G_pair_ij_reduce += G_pair_ij_local
G_pair_ji_reduce += G_pair_ji_local
for i, mag_ent in enumerate(builder.magnetic_entities):
mag_ent._Gii_tmp = G_mag_reduce[i]
for i, pair in enumerate(builder.pairs):
pair._Gij_tmp = G_pair_ij_reduce[i]
pair._Gji_tmp = G_pair_ji_reduce[i]
# these are the rotations perpendicular to the quantization axis
for u in orient["vw"]:
# section 2.H
H_XCF = rot_H.extract_exchange_field()[3]
Tu: NDArray = np.kron(
np.eye(int(builder.hamiltonian.NO / 2), dtype=int), tau_u(u)
)
Vu1, Vu2 = calc_Vu(H_XCF[rot_H.uc_in_sc_index], Tu)
for mag_ent in _tqdm(
builder.magnetic_entities,
desc="Setup perturbations for rotated hamiltonian",
):
# fill up the perturbed potentials (for now) based on the on-site projections
mag_ent._Vu1_tmp.append(
onsite_projection(
Vu1, mag_ent._spin_box_indices, mag_ent._spin_box_indices
)
)
mag_ent._Vu2_tmp.append(
onsite_projection(
Vu2, mag_ent._spin_box_indices, mag_ent._spin_box_indices
)
)
if builder.spin_model == "isotropic-only":
for pair in builder.pairs:
pair.energies = np.array(
[
[
interaction_energy(
pair.M1._Vu1_tmp,
pair.M2._Vu1_tmp,
pair._Gij_tmp,
pair._Gji_tmp,
builder.contour.weights,
)
]
]
)
else:
# calculate energies in the current reference hamiltonian direction
for mag_ent in builder.magnetic_entities:
mag_ent.calculate_energies(
builder.contour.weights,
append=True,
third_direction=builder.spin_model == "generalised-grogu",
)
for pair in builder.pairs:
pair.calculate_energies(builder.contour.weights, append=True)
# if we want to keep all the information for some reason we can do it
if not builder.low_memory_mode:
builder._rotated_hamiltonians.append(rot_H)
for mag_ent in builder.magnetic_entities:
mag_ent._Vu1.append(mag_ent._Vu1_tmp)
mag_ent._Vu2.append(mag_ent._Vu2_tmp)
mag_ent._Gii.append(mag_ent._Gii_tmp)
for pair in builder.pairs:
pair._Gij.append(pair._Gij_tmp)
pair._Gji.append(pair._Gji_tmp)
# or fill with empty stuff
else:
for mag_ent in builder.magnetic_entities:
mag_ent._Vu1.append([])
mag_ent._Vu2.append([])
mag_ent._Gii.append([])
for pair in builder.pairs:
pair._Gij.append([])
pair._Gji.append([])
# rotate back hamiltonian for the original DFT orientation
rot_H.rotate(rot_H.scf_xcf_orientation)
# finalize energies of the magnetic entities and pairs
# calculate magnetic parameters
for mag_ent in builder.magnetic_entities:
# delete temporary stuff
del mag_ent._Gii_tmp
del mag_ent._Vu1_tmp
del mag_ent._Vu2_tmp
if builder.apply_spin_model:
if builder.spin_model == "generalised-fit":
mag_ent.fit_anisotropy_tensor(builder.ref_xcf_orientations)
elif builder.spin_model == "generalised-grogu":
mag_ent.calculate_anisotropy()
else:
pass
for pair in builder.pairs:
# delete temporary stuff
del pair._Gij_tmp
del pair._Gji_tmp
if builder.apply_spin_model:
if builder.spin_model == "generalised-fit":
pair.fit_exchange_tensor(builder.ref_xcf_orientations)
elif builder.spin_model == "generalised-grogu":
pair.calculate_exchange_tensor()
elif builder.spin_model == "isotropic-only":
pair.calculate_isotropic_only()
else:
raise Exception(
f"Unknown spin model: {builder.spin_model}! Use apply_spin_model=False"
)
else:
[docs]
def gpu_solver(
max_g_per_loop: int,
mode: str,
gpu_number: int,
kpoints: list[NDArray],
kweights: list[NDArray],
SBI: NDArray,
SBI1: NDArray,
SBI2: NDArray,
Ruc: NDArray,
sc_off: NDArray,
samples: NDArray,
G_mag: NDArray,
G_pair_ij: NDArray,
G_pair_ji: NDArray,
rotated_H: NDArray,
S: NDArray,
rot_num: Union[int, str] = "Unknown",
) -> tuple["CNDArray", "CNDArray", "CNDArray"]:
"""Parallelizes the Green's function solution on GPU.
Should be used on large systems.
Parameters
----------
max_g_per_loop: int
Maximum number of greens function samples per loop
mode : str
The Greens function solver, which can be parallel or sequential
gpu_number : int
The ID of the GPU which we want to run on
kpoints : list[NDArray]
The kpoints already split for the GPUs
kweights : list[NDArray]
The kpoint weights already split for the GPUs
SBI : list[NDArray]
Spin box indices for the magnetic entities
SBI1 : list[NDArray]
Spin box indices for the pairs
SBI2 : list[NDArray]
Spin box indices for the pairs
Ruc : list[NDArray]
Unit cell shift of the pairs
sc_off : NDArray
List of unit cell shifts for unit cell indexes
samples : NDArray
Energy samples
G_mag : list[NDArray]
Empty container for the final Green's function on each magnetic entity
G_pair_ij : list[NDArray]
Empty container for the final Green's function on each pair
G_pair_ji : list[NDArray]
Empty container for the final Green's function on each pair
rotated_H : list[NDArray]
Hamiltonian with rotated exchange field
S : NDArray
Overlap matrix, should be the same for all Hamiltonians
rot_num: str, optional
Rotation number for tqdm print, by default "Unknown"
Returns
-------
local_G_mag : CNDArray
The Greens function of the mangetic entities
local_G_pair_ij : CNDArray
The Greens function from mangetic entity i to j on the given GPU
local_G_pair_ji : CNDArray
The Greens function from mangetic entity j to i on the given GPU
"""
raise Exception("GPU is not available!")
[docs]
def solve_parallel_over_k(builder: "Builder", print_memory: bool = False) -> None:
"""It calculates the energies by the Greens function method.
It inverts the Hamiltonians of all directions set up in the given
k-points at the given energy levels. The solution is parallelized over
k-points. It uses the number of GPUs given. And determines the parallelization
over energy levels from the ``builder.greens_function_solver`` attribute.
Parameters
----------
builder : Builder
The system that we want to solve
print_memory: bool, optional
It can be turned on to print extra memory info, by default False
"""
raise Exception("GPU is not available!")
if __name__ == "__main__":
pass