# -*- coding: utf-8 -*-

"""
LHS
===
Provides a Latin Hypercube Sampling method.

Available functions
-------------------
:func:`~lhd`
    Generates a Latin Hypercube Design of `n_sam` samples, each with `n_val`
    values. Method for choosing the 'best' Latin Hypercube Design depends on
    the `method` and `criterion` that are used.

"""


# %% IMPORTS
# Package imports
import numpy as np
from numpy.random import choice, permutation, rand

# e13Tools imports
from e13tools.core import ShapeError
from e13tools.math import nCr
from e13tools.numpy import diff

# All declaration
__all__ = ['lhd']


# %% FUNCTIONS
# TODO: Check if MPI is possible
def lhd(n_sam, n_val, val_rng=None, method='random', criterion=None,
        iterations=1000, get_score=False, quickscan=True, constraints=None):
    """
    Generates a Latin Hypercube Design of `n_sam` samples, each with `n_val`
    values. The method for choosing the 'best' Latin Hypercube Design depends
    on the `method` and `criterion` that are used.

    Parameters
    ----------
    n_sam : int
        The number of samples to generate.
    n_val : int
        The number of values in a single sample.

    Optional
    --------
    val_rng : 2D array_like or None. Default: None
        Array defining the lower and upper limits of every value in a sample.
        Requires: numpy.shape(val_rng) = (`n_val`, 2).
        If *None*, the output is normalized.
    method : {'random'; 'fixed'; 'center'}. Default: 'random'
        String specifying the method used to construct the Latin Hypercube
        Design. See ``Notes`` for more details.
        If `n_sam` == 1 or `n_val` == 1, `method` is set to the closest
        corresponding method if necessary.
    criterion : float, {'maximin'; 'correlation'; 'multi'} or None. \
        Default: None
        Float or string specifying the criterion the Latin Hypercube Design
        has to satisfy, or *None* for no criterion. See ``Notes`` for more
        details.
        If `n_sam` <= 2 or `n_val` == 1, `criterion` is set to *None* if
        necessary.
    iterations : int. Default: 1000
        Number of iterations used for the criterion algorithm.
    get_score : bool. Default: False
        If *True*, the normalized maximin, correlation and multi scores are
        also returned if a criterion is used.
    quickscan : bool. Default: True
        If *True*, a faster but less precise algorithm is used for the
        criteria.
    constraints : 2D array_like or None. Default: None
        If `constraints` is not empty and `criterion` is not *None*,
        `sam_set` + `constraints` will satisfy the given criterion instead of
        `sam_set` alone. Providing this argument when `criterion` is *None*
        will discard it.
        **WARNING**: If `constraints` is not a 'fixed' or 'center' lay-out
        LHD, the output might contain errors.

    Returns
    -------
    sam_set : 2D :obj:`~numpy.ndarray` object
        Sample set array of shape [`n_sam`, `n_val`].

    Notes
    -----
    The 'method' argument specifies the way in which the values should be
    distributed within the value intervals. The following methods can be
    used:

    ======== ===================================
    method   interval lay-out
    ======== ===================================
    'random' Values are randomized
    'fixed'  Values are fixed to maximize spread
    'center' Values are centered
    'r'      Same as 'random'
    'f'      Same as 'fixed'
    'c'      Same as 'center'
    ======== ===================================

    The 'fixed' method chooses values in such a way that the distance between
    the values is maximized.

    The 'criterion' argument specifies how much priority should be given to
    maximizing the minimum distance and minimizing the correlation between
    samples. Strings specify basic priority cases, while a value between 0
    and 1 specifies a custom case. The following criteria can be used (the
    last column shows the equivalent float value):

    ============= ==================================================== =====
    criterion     effect/priority                                      equiv
    ============= ==================================================== =====
    None          No priority                                          --
    'maximin'     Maximum priority for maximizing the minimum distance 0.0
    'correlation' Maximum priority for minimizing the correlation      1.0
    'multi'       Equal priority for both                              0.5
    [0, 1]        Priority is given according to the value provided    --
    ============= ==================================================== =====

    Examples
    --------
    Latin Hypercube with 5 samples, each with 2 random, fixed or centered
    values:

    >>> import numpy as np
    >>> np.random.seed(0)
    >>> lhd(5, 2, method='random')
    array([[ 0.34303787,  0.55834501],
           [ 0.70897664,  0.70577898],
           [ 0.88473096,  0.19273255],
           [ 0.1097627 ,  0.91360891],
           [ 0.52055268,  0.2766883 ]])
    >>> lhd(5, 2, method='fixed')
    array([[ 0.5 ,  0.75],
           [ 0.25,  0.25],
           [ 0.  ,  1.  ],
           [ 0.75,  0.5 ],
           [ 1.  ,  0.  ]])
    >>> lhd(5, 2, method='center')
    array([[ 0.1,  0.9],
           [ 0.9,  0.5],
           [ 0.5,  0.7],
           [ 0.3,  0.3],
           [ 0.7,  0.1]])

    Latin Hypercube with 4 samples, each with 3 values in a specified value
    range:

    >>> import numpy as np
    >>> np.random.seed(0)
    >>> val_rng = [[0, 2], [1, 4], [0.3, 0.5]]
    >>> lhd(4, 3, val_rng=val_rng)
    array([[ 1.30138169,  2.41882975,  0.41686981],
           [ 0.27440675,  1.32819041,  0.48240859],
           [ 1.77244159,  3.53758114,  0.39180394],
           [ 0.85759468,  3.22274707,  0.31963924]])

    Latin Hypercubes can also be created by specifying a criterion with
    either a string or a normalized float. The strings identify basic float
    values.

    >>> import numpy as np
    >>> np.random.seed(0)
    >>> lhd(4, 3, method='fixed', criterion=0)
    array([[ 0.66666667,  0.        ,  0.66666667],
           [ 1.        ,  0.66666667,  0.        ],
           [ 0.33333333,  1.        ,  1.        ],
           [ 0.        ,  0.33333333,  0.33333333]])
    >>> np.random.seed(0)
    >>> lhd(4, 3, method='fixed', criterion='maximin')
    array([[ 0.66666667,  0.        ,  0.66666667],
           [ 1.        ,  0.66666667,  0.        ],
           [ 0.33333333,  1.        ,  1.        ],
           [ 0.        ,  0.33333333,  0.33333333]])

    """

    # Make sure that if val_rng is given, it is valid
    if val_rng is not None:
        # If val_rng is 1D, convert it to 2D (expected for 'n_val' = 1)
        val_rng = np.array(val_rng, ndmin=2)

        # Check if the given val_rng has the correct shape
        if not(val_rng.shape == (n_val, 2)):
            raise ShapeError("'val_rng' has incompatible shape: %s != "
                             "(%s, %s)" % (val_rng.shape, n_val, 2))

    # TODO: Implement constraints method again!
    # Make sure that constraints is a numpy array
    if constraints is not None:
        constraints = np.array(constraints, ndmin=2)

    # Check the shape of 'constraints' and act accordingly
    if constraints is None:
        pass
    elif(constraints.shape[-1] == 0):
        # If constraints is empty, there are no constraints
        constraints = None
    elif(constraints.ndim != 2):
        # If constraints is not two-dimensional, it is invalid
        raise ShapeError("Constraints must be two-dimensional!")
    elif(constraints.shape[-1] == n_val):
        # If constraints has the same number of values, it is valid
        constraints = _extract_sam_set(constraints, val_rng)
    else:
        # If not empty and not the right shape, it is invalid
        raise ShapeError("Constraints has incompatible number of values: "
                         "%s != %s" % (np.shape(constraints)[1], n_val))

    # Check for cases in which some methods make no sense
    if(n_sam == 1 and method.lower() in ('fixed', 'f')):
        method = 'center'
    elif(criterion is not None and method.lower() in ('random', 'r')):
        method = 'fixed'

    # Check for cases in which some criteria make no sense
    # If so, criterion is set to something more useful
    if criterion is None:
        pass
    elif(n_sam == 1 or n_sam == 2 or n_val == 1):
        criterion = None
    elif isinstance(criterion, (int, float)):
        if not(0 <= criterion <= 1):
            raise ValueError("Input argument 'criterion' can only have a "
                             "normalized value as a float value!")
    elif criterion.lower() not in ('maximin', 'correlation', 'multi'):
        raise ValueError("Input argument 'criterion' can only have "
                         "{'maximin', 'correlation', 'multi'} as string "
                         "values!")

    # Pick the correct lhs-method according to method
    if method.lower() in ('random', 'r'):
        sam_set = _lhd_random(n_sam, n_val)
    elif method.lower() in ('fixed', 'f'):
        sam_set = _lhd_fixed(n_sam, n_val)
    elif method.lower() in ('center', 'c'):
        sam_set = _lhd_center(n_sam, n_val)

    # If a criterion is given, optimize the design for it
    if criterion is not None:
        multi_obj = Multi_LHD(sam_set, criterion, iterations, quickscan,
                              constraints)
        sam_set, mm_val, corr_val, multi_val = multi_obj()

    # If a val_rng was given, scale sam_set to this range
    if val_rng is not None:
        # Scale sam_set according to val_rng
        sam_set = val_rng[:, 0]+sam_set*(val_rng[:, 1]-val_rng[:, 0])

    if get_score and criterion is not None:
        return(sam_set, np.array([mm_val, corr_val, multi_val]))
    else:
        return(sam_set)
def _lhd_random(n_sam, n_val):
    # Generate the equally spaced intervals/bins
    bins = np.linspace(0, 1, n_sam+1)

    # Obtain the lower and upper bounds of the bins
    bins_low = bins[0:n_sam]
    bins_high = bins[1:n_sam+1]

    # Pair values randomly together to obtain random samples
    sam_set = np.zeros([n_sam, n_val])
    for i in range(n_val):
        sam_set[:, i] = permutation(bins_low+rand(n_sam)*(bins_high-bins_low))

    # Return sam_set
    return(sam_set)


def _lhd_fixed(n_sam, n_val):
    # Generate the maximally spaced values in every dimension
    val = np.linspace(0, 1, n_sam)

    # Pair values randomly together to obtain random samples
    sam_set = np.zeros([n_sam, n_val])
    for i in range(n_val):
        sam_set[:, i] = permutation(val)

    # Return sam_set
    return(sam_set)


def _lhd_center(n_sam, n_val):
    # Generate the equally spaced intervals/bins
    bins = np.linspace(0, 1, n_sam+1)

    # Obtain the lower and upper bounds of the bins
    bins_low = bins[0:n_sam]
    bins_high = bins[1:n_sam+1]

    # Capture the centers of every bin
    center_num = (bins_low+bins_high)/2

    # Pair values randomly together to obtain random samples
    sam_set = np.zeros([n_sam, n_val])
    for i in range(n_val):
        sam_set[:, i] = permutation(center_num)

    # Return sam_set
    return(sam_set)


class Multi_LHD(object):
    def __init__(self, sam_set, criterion, iterations, quickscan,
                 constraints):
        # Save all arguments as class attributes
        self.sam_set = sam_set
        self.iterations = iterations
        self.quickscan = quickscan
        self.n_sam, self.n_val = self.sam_set.shape

        # Combine constraints with sam_set
        if constraints is not None:
            self.n_sam_c, _ = constraints.shape
            self.sam_set = np.concatenate([self.sam_set, constraints],
                                          axis=0)
        else:
            self.n_sam_c = 0
        self.sam_set = self.sam_set*(self.n_sam-1)
        self.p = 15

        # Check the criterion type and act accordingly
        if isinstance(criterion, (int, float)):
            self.importance = criterion
        elif(criterion.lower() == 'maximin'):
            self.importance = 0
        elif(criterion.lower() == 'correlation'):
            self.importance = 1
        elif(criterion.lower() == 'multi'):
            self.importance = 0.5

        # Obtain the maximin-rank boundaries
        self._get_mm_bounds()

    def __call__(self):
        self._lhd_multi()
        return(self.sam_set_best[:self.n_sam]/(self.n_sam-1),
               self.mm_tot_best, np.sqrt(self.corr_tot_best),
               self.multi_val_best)

    def _get_mm_bounds(self):
        # Calculate the p_dist of the provided sam_set
        p_dist_slice = self.n_sam_c*self.n_sam+nCr(self.n_sam, 2)
        p_dist = abs(diff(self.sam_set, axis=0, flatten=True))[:p_dist_slice]

        # Calculate the average distance between randomly chosen samples
        self.dist_avg = np.average(np.sum(p_dist, axis=-1))

        # TODO: Look at the calculation of mm_lower once more
        # Calculate the lower and upper boundaries of the maximin-rank
        self.mm_lower = pow(nCr(self.n_sam, 2), 1/self.p)/self.dist_avg
        p_dist_sort = np.sum(np.sort(p_dist, axis=0), axis=-1)

        # TODO: This has many exception cases, so maybe make an extra method?
        # If (due to constraints) any value in p_dist is zero, change p_dist
        # in such a way that it resembles the worst p_dist possible with no
        # zeros
        if((p_dist_sort == 0).any()):
            _, counts = np.unique(p_dist_sort, return_counts=True)
            p_dist_sort[:counts[0]] += np.min(p_dist[p_dist != 0])
            p_dist_sort[counts[0]:counts[1]] -= np.min(p_dist[p_dist != 0])
        self.mm_upper = pow(np.sum(pow(p_dist_sort, -self.p)), 1/self.p)

    # TODO: If method is 'random', maybe randomly generate a new value in the
    # interval the values belong to after swapping. This way, the LHD will
    # actually be random
    def _lhd_multi(self):
        # Calculate the rank of this design
        sam_set = self.sam_set
        p_dist, mm_val = self._get_mm_val_init(sam_set)
        mm_tot = self._get_mm_tot(mm_val)
        corr_val = self._get_corr_val(sam_set)
        corr_tot = self._get_corr_tot(corr_val)
        multi_val = self._get_multi_val(corr_tot, mm_tot)

        # Save a copy of sam_set as sam_set_best
        sam_set_best = sam_set.copy()
        corr_tot_best = corr_tot
        mm_tot_best = mm_tot
        multi_val_best = multi_val

        # Maximize the minimum distance between points
        if self.quickscan:
            It = 0
            corr_val_try = corr_val
            corr_tot_try = corr_tot
            p_dist_try = p_dist
            mm_val_try = mm_val
            mm_tot_try = mm_tot
            multi_val_try = multi_val
            sam_set_try = sam_set.copy()
            while(It < self.iterations):
                # Pick the column with the highest correlation
                col = corr_val_try.argmax()

                # Pick the row with the highest maximin value and a random
                # second row
                rows = [mm_val_try.argmax()]
                rows_p = list(range(self.n_sam))
                rows_p.remove(rows[0])
                rows.append(choice(rows_p))

                # Swap the elements
                val = sam_set_try[rows[0], col]
                sam_set_try[rows[0], col] = sam_set_try[rows[1], col]
                sam_set_try[rows[1], col] = val

                # Calculate the multi rank of sam_set_try
                p_dist_try, mm_val_try =\
                    self._get_mm_val(sam_set_try, p_dist_try, mm_val_try,
                                     rows[0], rows[1])
                mm_tot_try = self._get_mm_tot(mm_val_try)
                corr_val_try = self._get_corr_val(sam_set_try)
                corr_tot_try = self._get_corr_tot(corr_val_try)
                multi_val_try = self._get_multi_val(corr_tot_try, mm_tot_try)

                # If this rank is lower than the current best rank, save
                # sam_set
                if(multi_val_try < multi_val_best):
                    multi_val_best = multi_val_try
                    corr_tot_best = corr_tot_try
                    mm_tot_best = mm_tot_try
                    sam_set_best = sam_set_try.copy()
                    It = 0
                else:
                    It += 1
        else:
            t = 1
            flag = 1
            while(flag == 1):
                It = 0
                flag = 0
                while(It < self.iterations):
                    corr_val_try = corr_val
                    corr_tot_try = corr_tot
                    p_dist_try = p_dist
                    mm_val_try = mm_val
                    mm_tot_try = mm_tot
                    multi_val_try = multi_val
                    sam_set_try = sam_set.copy()

                    # Pick the column with the highest correlation
                    col = corr_val_try.argmax()

                    # Pick the row with the highest maximin value and a
                    # random second row
                    rows = [mm_val_try.argmax()]
                    rows_p = list(range(self.n_sam))
                    rows_p.remove(rows[0])
                    rows.append(choice(rows_p))

                    # Swap the elements
                    val = sam_set_try[rows[0], col]
                    sam_set_try[rows[0], col] = sam_set_try[rows[1], col]
                    sam_set_try[rows[1], col] = val

                    # Calculate the multi rank of sam_set_try
                    p_dist_try, mm_val_try =\
                        self._get_mm_val(sam_set_try, p_dist_try, mm_val_try,
                                         rows[0], rows[1])
                    mm_tot_try = self._get_mm_tot(mm_val_try)
                    corr_val_try = self._get_corr_val(sam_set_try)
                    corr_tot_try = self._get_corr_tot(corr_val_try)
                    multi_val_try = self._get_multi_val(corr_tot_try,
                                                        mm_tot_try)

                    # If this rank is lower than the current rank (or it is
                    # accepted by the annealing schedule), save sam_set
                    if(multi_val_try < multi_val or
                       (multi_val_try > multi_val and
                        rand() < np.exp((multi_val-multi_val_try)/t))):
                        corr_val = corr_val_try
                        corr_tot = corr_tot_try
                        p_dist = p_dist_try
                        mm_val = mm_val_try
                        mm_tot = mm_tot_try
                        multi_val = multi_val_try
                        sam_set = sam_set_try.copy()
                        flag = 1

                    # If this rank is lower than the current best rank,
                    # save it
                    if(multi_val_try < multi_val_best):
                        multi_val_best = multi_val_try
                        corr_tot_best = corr_tot_try
                        mm_tot_best = mm_tot_try
                        sam_set_best = sam_set_try.copy()
                        It = 0
                    else:
                        It += 1

                # Decrease the temperature by 10%
                t *= 0.9

        # Save the best found sam_set and its scores
        self.sam_set_best = sam_set_best
        self.corr_tot_best = corr_tot_best
        self.mm_tot_best = mm_tot_best
        self.multi_val_best = multi_val_best

    def _get_corr_val(self, sam_set):
        # Obtain the cross-correlation values between all columns in sam_set
        cross_corr = np.corrcoef(sam_set, rowvar=False)
        cross_corr[np.eye(self.n_val, dtype=int) == 1] = 0

        # Calculate the squared correlation values of all columns
        corr_val = np.sum(pow(cross_corr, 2), axis=-1)/(self.n_val-1)

        # Return it
        return(corr_val)

    def _get_mm_val_init(self, sam_set):
        # TODO: Check if masked arrays are really faster than lots of
        # indexing
        # Calculate the pair-wise point distances
        masked_idx = np.diag_indices(self.n_sam, ndim=2)
        p_dist = np.sum(abs(diff(sam_set, axis=0, flatten=False)), axis=-1)
        p_dist = np.ma.array(p_dist, mask=False, hard_mask=True)
        p_dist.mask[masked_idx] = True
        p_dist.mask[self.n_sam:, self.n_sam:] = True

        # Create an empty array containing the maximin values of all rows
        mm_val = np.zeros(self.n_sam)

        # Calculate the maximin values
        for i, row_dist in enumerate(p_dist[:self.n_sam]):
            mm_val[i] = pow(np.sum(pow(row_dist, -self.p)), 1/self.p)

        # Return it
        return(p_dist, mm_val)

    def _get_mm_val(self, sam_set, p_dist_old, mm_val_old, r1, r2):
        # Create arrays containing the new p_dist and mm_val
        p_dist = p_dist_old.copy()
        mm_val = np.zeros(self.n_sam)

        # Calculate the new p_dist of rows r1 and r2
        p_dist_r1 = np.sum(abs(diff(sam_set[r1], sam_set)), axis=-1)
        p_dist_r2 = np.sum(abs(diff(sam_set[r2], sam_set)), axis=-1)

        # Update p_dist and mm_val with the newly calculated values
        p_dist[r1] = p_dist[:, r1] = p_dist_r1
        p_dist[r2] = p_dist[:, r2] = p_dist_r2
        iets1 = p_dist_r1[np.arange(self.n_sam+self.n_sam_c) != r1]
        iets2 = p_dist_r2[np.arange(self.n_sam+self.n_sam_c) != r2]
        if((iets1 == 0).any()):
            mm_val[r1] = np.inf
        else:
            mm_val[r1] = pow(np.sum(pow(iets1, -self.p)), 1/self.p)
        if((iets2 == 0).any()):
            mm_val[r2] = np.inf
        else:
            mm_val[r2] = pow(np.sum(pow(iets2, -self.p)), 1/self.p)

        # Create a list containing only the indices of the unchanged rows
        idx = list(range(self.n_sam))
        idx.remove(r1)
        idx.remove(r2)

        # Update the mm_val of all unchanged rows
        mm_val[idx] = pow(pow(mm_val_old[idx], self.p) -
                          pow(p_dist_old[r1, idx], -self.p) -
                          pow(p_dist_old[r2, idx], -self.p) +
                          pow(p_dist_r1[idx], -self.p) +
                          pow(p_dist_r2[idx], -self.p), 1/self.p)

        # Return it
        return(p_dist, mm_val)

    def _get_mm_tot(self, mm_val):
        # Calculate the total mm_val
        mm_tot = pow(0.5*np.sum(pow(mm_val, self.p)), 1/self.p)

        # Return it, normalized by the maximin-rank boundaries
        return((mm_tot-self.mm_lower)/(self.mm_upper-self.mm_lower))

    def _get_corr_tot(self, corr_val):
        # Calculate the total corr_val
        return(np.sum(corr_val)/(self.n_val))

    def _get_multi_val(self, corr_tot, mm_tot):
        # Combine corr_tot and mm_tot into the multi_val
        return(self.importance*corr_tot+(1-self.importance)*mm_tot)
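
# Hypothetical reference implementation (an illustration, not part of the
# e13tools API): a brute-force sketch of the phi_p maximin metric that
# Multi_LHD above maintains incrementally through _get_mm_val_init() and
# _get_mm_val(). For a large p (the class uses p = 15),
# phi_p = (sum_(i<j) d_ij**-p)**(1/p) is dominated by the smallest pair-wise
# Manhattan distance d_ij, so minimizing phi_p approximately maximizes the
# minimum distance. This sketch assumes a sample set without constraint
# samples or zero distances, and omits the mm_lower/mm_upper normalization
# applied by _get_mm_tot().
def _phi_p_bruteforce(sam_set, p=15):
    sam_set = np.asarray(sam_set)
    n_sam = len(sam_set)
    terms = []
    for i in range(n_sam):
        for j in range(i+1, n_sam):
            # Manhattan (L1) distance between samples i and j
            d_ij = np.sum(abs(sam_set[i]-sam_set[j]))
            terms.append(pow(d_ij, -p))

    # Return the phi_p value of the whole sample set
    return(pow(np.sum(terms), 1/p))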
def _extract_sam_set(sam_set, val_rng):
    """
    Extracts the samples from `sam_set` that are within the given value
    ranges `val_rng`. Also extracts the samples that are just outside of the
    given value ranges.

    Parameters
    ----------
    sam_set : 2D array_like
        Sample set containing the samples that require extraction.
    val_rng : 2D array_like or None
        Array defining the lower and upper limits of every value in a
        sample. If *None*, the sample set is assumed to be normalized.

    Returns
    -------
    ext_sam_set : 2D array_like
        Sample set containing the extracted samples.

    """

    # Obtain the number of samples and the number of values
    n_sam, n_val = sam_set.shape

    # Check if val_rng is given. If not, set it to the default range
    if val_rng is None:
        val_rng = np.zeros([n_val, 2])
        val_rng[:, 1] = 1

    # Scale all samples to the value range [0, 1]
    sam_set = ((sam_set-val_rng[:, 0])/(val_rng[:, 1]-val_rng[:, 0]))

    # Create an empty list of valid samples
    ext_sam_set = []

    # Create the lower and upper limits of the hypercube containing samples
    # that can influence the created hypercube
    lower_lim = 0-np.sqrt(n_val)
    upper_lim = 1+np.sqrt(n_val)

    # Check which samples are within val_rng or just outside of it
    for i in range(n_sam):
        # If a sample is within the outer hypercube, save it
        if(((lower_lim <= sam_set[i, :])*(sam_set[i, :] <= upper_lim)).all()):
            ext_sam_set.append(sam_set[i, :])

    # Return ext_sam_set
    return(np.array(ext_sam_set))
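
# Illustrative usage sketch (an assumption, not shipped with the module):
# demonstrates how get_score=True returns the normalized maximin,
# correlation and multi scores alongside the design, and how an existing
# sample set can be passed as 'constraints' so that the combined set
# satisfies the chosen criterion. The exact values depend on the random
# seed.
if __name__ == '__main__':
    np.random.seed(0)

    # Request the scores along with an 8x3 multi-criterion LHD
    sam_set, scores = lhd(8, 3, method='fixed', criterion='multi',
                          iterations=100, get_score=True)
    print(sam_set.shape)    # (8, 3) -> [n_sam, n_val]
    print(scores)           # [maximin score, correlation score, multi score]

    # Extend the design: 4 new samples optimized against the existing ones
    new_set = lhd(4, 3, method='fixed', criterion='maximin', iterations=100,
                  constraints=sam_set)
    print(new_set.shape)    # (4, 3)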