#line 1 "numpy/core/src/umath/loops_exponent_log.dispatch.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*@targets
 ** $maxopt baseline
 ** (avx2 fma3) avx512f avx512_skx
 **/

#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include <float.h>

#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"
#include "npy_simd_data.h"

// TODO: tweak & replace raw SIMD with NPYV

/********************************************************************************
 ** bunch of helper functions used in ISA_exp/log_FLOAT
 ********************************************************************************/
/*
 * Select which raw-SIMD code path of this file gets compiled. AVX512F takes
 * precedence over AVX2+FMA3; the two are mutually exclusive (#if/#elif).
 */
#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
    /**
     * The AVX512F path is disabled under MSVC: for some reason its
     * aggressive optimization leads to a spurious
     * 'RuntimeWarning: overflow encountered in exp'.
     *
     * The issue is mainly caused by '_mm512_maskz_loadu_ps'; it needs to be
     * investigated while moving this code to NPYV.
     */
    #define SIMD_AVX512F
#elif defined(NPY_HAVE_AVX2) && defined(NPY_HAVE_FMA3)
    #define SIMD_AVX2_FMA3
#endif
/* AVX512_SKX is likewise never enabled under MSVC. */
#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512_SKX)
    #define SIMD_AVX512_SKX
#endif
/*
 * Defined for AVX512F builds unless the compiler is clang older than 10.1
 * (the condition below excludes major < 10 and 10.0.x).
 */
#if defined(SIMD_AVX512F) && !(defined(__clang__) && (__clang_major__ < 10 || \
                              (__clang_major__ == 10 && __clang_minor__ < 1)))
    #define SIMD_AVX512F_NOCLANG_BUG
#endif

#ifdef SIMD_AVX2_FMA3

/*
 * Mask that selects every float lane of a 256-bit vector.
 * Every lane holds -1.0f, so its sign bit is set and it converts to the
 * integer -1 (all bits set) when passed through _mm256_cvtps_epi32.
 */
static NPY_INLINE __m256
fma_get_full_load_mask_ps(void)
{
    const __m256 every_lane = _mm256_set1_ps(-1.0);
    return every_lane;
}

/*
 * Mask that selects every double lane of a 256-bit vector, returned as an
 * integer vector. Each 64-bit lane carries the bit pattern of -1.0, i.e. its
 * most significant bit is set.
 */
static NPY_INLINE __m256i
fma_get_full_load_mask_pd(void)
{
    const __m256d every_lane = _mm256_set1_pd(-1.0);
    return _mm256_castpd_si256(every_lane);
}

/*
 * Build a float mask by sliding an 8-lane window over a table of eight
 * -1.0f entries followed by eight 1.0f entries. The window starts at
 * (num_lanes - num_elem), so with num_lanes == 8 the first num_elem lanes
 * are -1.0f (sign bit set, selected) and the rest are 1.0f (not selected).
 * num_elem == 0 therefore yields a mask with no lane selected.
 */
static NPY_INLINE __m256
fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
    float lane_flags[16] = {
        -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0,
         1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0
    };
    const npy_int window_start = num_lanes - num_elem;
    return _mm256_loadu_ps(&lane_flags[window_start]);
}

/*
 * Double-precision counterpart of the partial float mask. The table holds
 * 32-bit integers, so every 64-bit lane spans two entries and the window
 * offset is doubled. A lane made of (-1, -1) has all bits set (selected);
 * a lane made of (1, 1) has its most significant bit clear (not selected).
 */
static NPY_INLINE __m256i
fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
{
    npy_int lane_flags[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
                               1,  1,  1,  1,  1,  1,  1,  1};
    const npy_int window_start = 2*num_lanes - 2*num_elem;
    npy_int *window = &lane_flags[window_start];
    return _mm256_loadu_si256((__m256i*) window);
}

/*
 * Masked gather of eight floats from addr[vindex[i]].
 * The scale of 4 bytes means vindex counts float elements. Lanes not
 * selected by mask are taken from src.
 */
static NPY_INLINE __m256
fma_masked_gather_ps(__m256 src,
                     npy_float* addr,
                     __m256i vindex,
                     __m256 mask)
{
    const __m256 gathered = _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
    return gathered;
}

/*
 * Masked gather of four doubles from addr[vindex[i]].
 * The scale of 8 bytes means the 32-bit indices count double elements.
 * Lanes not selected by mask are taken from src.
 */
static NPY_INLINE __m256d
fma_masked_gather_pd(__m256d src,
                     npy_double* addr,
                     __m128i vindex,
                     __m256d mask)
{
    const __m256d gathered = _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
    return gathered;
}

/*
 * Masked contiguous load of up to eight floats from addr.
 * The float mask is first converted to integers (-1.0f -> -1, 1.0f -> 1) so
 * that only lanes built from -1.0f end up with their top bit set.
 */
static NPY_INLINE __m256
fma_masked_load_ps(__m256 mask, npy_float* addr)
{
    const __m256i int_mask = _mm256_cvtps_epi32(mask);
    return _mm256_maskload_ps(addr, int_mask);
}

/*
 * Masked contiguous load of up to four doubles from addr; mask is already an
 * integer vector and is used as is.
 */
static NPY_INLINE __m256d
fma_masked_load_pd(__m256i mask, npy_double* addr)
{
    const __m256d loaded = _mm256_maskload_pd(addr, mask);
    return loaded;
}

/*
 * Per lane: return val where mask is set (sign bit), otherwise keep x.
 */
static NPY_INLINE __m256
fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
    const __m256 merged = _mm256_blendv_ps(x, val, mask);
    return merged;
}

/*
 * Per lane: return val where mask is set (sign bit), otherwise keep x.
 */
static NPY_INLINE __m256d
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
{
    const __m256d merged = _mm256_blendv_pd(x, val, mask);
    return merged;
}

/*
 * Per lane: pick y where ymask is set (sign bit), otherwise pick x.
 */
static NPY_INLINE __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
    const __m256 picked = _mm256_blendv_ps(x, y, ymask);
    return picked;
}

/*
 * Invert a float lane mask: computes (~ymask) & bits(-1.0f) per lane.
 * A lane that was all-ones becomes zero; a lane that was zero becomes the
 * bit pattern of -1.0f, whose sign bit is set.
 */
static NPY_INLINE __m256
fma_invert_mask_ps(__m256 ymask)
{
    const __m256 neg_one = _mm256_set1_ps(-1.0);
    return _mm256_andnot_ps(ymask, neg_one);
}

/*
 * Invert an integer lane mask: computes (~ymask) & all-ones, i.e. the
 * bitwise complement of ymask.
 */
static NPY_INLINE __m256i
fma_invert_mask_pd(__m256i ymask)
{
    const __m256i all_ones = _mm256_set1_epi32(0xFFFFFFFF);
    return _mm256_andnot_si256(ymask, all_ones);
}

/*
 * Return, as floats, the binary exponent e of each lane of x such that
 * x = m * 2^e with m in [0.5, 1) (biased exponent field minus 126).
 * This pairs with the [0.5, 1) mantissa returned by fma_get_mantissa().
 * The sign bit is not masked off before the shift, so the result is only
 * meaningful for non-negative lanes.
 */
static NPY_INLINE __m256
fma_get_exponent(__m256 x)
{
    /*
     * Special handling of denormals:
     * 1) Multiply denormal elements with 2**100 (0x71800000)
     * 2) Get the 8 bits of the exponent field and subtract 126 (0x7E)
     * 3) Subtract 100 from exponent of denormals
     */

    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
    /*
     * Both compares are ordered, so a NaN lane is in neither mask.
     * denormal_mask also covers zero and negative lanes (anything < FLT_MIN).
     */
    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);

    /*
     * It is necessary for temp1 to be volatile, a bug in clang optimizes it out which leads
     * to an overflow warning in some cases. See https://github.com/numpy/numpy/issues/18005
     */
    /* Normal lanes are zeroed first so that scaling by 2**100 cannot overflow. */
    volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
    /* Only lanes below FLT_MIN take the scaled value. */
    x = _mm256_blendv_ps(x, temp, denormal_mask);

    /* (bits >> 23) - 126, converted to float */
    __m256 exp = _mm256_cvtepi32_ps(
                    _mm256_sub_epi32(
                        _mm256_srli_epi32(
                            _mm256_castps_si256(x), 23),_mm256_set1_epi32(0x7E)));

    /* Undo the 2**100 scaling on the lanes that received it. */
    __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f));
    return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
}

/*
 * Return the mantissa of each lane of x normalized to [0.5, 1): the 23
 * fraction bits of x are kept and the exponent field is replaced by 126
 * (the sign bit is cleared by the mask).
 */
static NPY_INLINE __m256
fma_get_mantissa(__m256 x)
{
    /*
     * Special handling of denormals:
     * 1) Multiply denormal elements with 2**100 (0x71800000)
     * 2) Get the 23 bits of mantissa
     * 3) Mantissa for denormals is not affected by the multiplication
     */

    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
    /* Ordered compares: a NaN lane is in neither mask. */
    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);

    /*
     * It is necessary for temp1 to be volatile, a bug in clang optimizes it out which leads
     * to an overflow warning in some cases. See https://github.com/numpy/numpy/issues/18005
     */
    /* Normal lanes are zeroed first so that scaling by 2**100 cannot overflow. */
    volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
    x = _mm256_blendv_ps(x, temp, denormal_mask);

    /* Keep the fraction bits and force the exponent field to 126 (2^-1). */
    __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff);
    __m256i exp_126_bits  = _mm256_set1_epi32(126 << 23);
    return _mm256_castsi256_ps(
                _mm256_or_si256(
                    _mm256_and_si256(
                        _mm256_castps_si256(x), mantissa_bits), exp_126_bits));
}

/*
 * Compute poly * 2^quadrant per lane by adding quadrant to the exponent
 * field of poly.
 *
 * Lanes with quadrant <= -125 would produce a denormal, for which the plain
 * exponent addition breaks down. For those lanes the scaling is split:
 *   1) quadrant is clamped to -125 and poly * 2^-125 is formed the usual way
 *   2) the remaining factor 2^-(quadrant + 125) is applied by dividing by
 *      the integer (1 << -(quadrant + 125)); this division generates the
 *      denormal
 * The extra work is skipped entirely when no lane needs it.
 */
static NPY_INLINE __m256
fma_scalef_ps(__m256 poly, __m256 quadrant)
{
    const __m256 quad_floor = _mm256_set1_ps(-125.0f);
    const __m256 tiny_lanes = _mm256_cmp_ps(quadrant, quad_floor, _CMP_LE_OQ);
    const int any_tiny = (_mm256_movemask_ps(tiny_lanes) != 0x0000);
    __m256i divisor = _mm256_set1_epi32(1);

    if (any_tiny) {
        /* shift = -(quadrant + 125) on tiny lanes, 0 on all other lanes */
        __m256 shift = _mm256_sub_ps(quadrant, quad_floor);
        shift = _mm256_sub_ps(_mm256_setzero_ps(), shift);
        shift = _mm256_blendv_ps(_mm256_setzero_ps(), shift, tiny_lanes);
        divisor = _mm256_sllv_epi32(divisor, _mm256_cvtps_epi32(shift));
        /* keep quadrant >= -125 for the exponent addition below */
        quadrant = _mm256_max_ps(quadrant, quad_floor);
    }

    /* move quadrant into the exponent position and add it to poly's bits */
    const __m256i exp_bits = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23);
    const __m256 scaled = _mm256_castsi256_ps(
                              _mm256_add_epi32(
                                  _mm256_castps_si256(poly), exp_bits));
    if (!any_tiny) {
        return scaled;
    }

    const __m256 denorm_scaled = _mm256_div_ps(scaled, _mm256_cvtepi32_ps(divisor));
    return _mm256_blendv_ps(scaled, denorm_scaled, tiny_lanes);
}

#endif // SIMD_AVX2_FMA3

#ifdef SIMD_AVX512F

/*
 * Mask selecting all sixteen float lanes of a 512-bit vector.
 */
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_full_load_mask_ps(void)
{
    const __mmask16 every_lane = 0xFFFF;
    return every_lane;
}

/*
 * Mask selecting all eight double lanes of a 512-bit vector.
 */
static NPY_INLINE __mmask8
avx512_get_full_load_mask_pd(void)
{
    const __mmask8 every_lane = 0xFF;
    return every_lane;
}

/*
 * Mask with the lowest num_elem bits set, i.e. selecting the first num_elem
 * float lanes. total_elem is not used; it only mirrors the signature of the
 * AVX2 variant.
 */
static NPY_INLINE __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
    const int one_past_last = 0x0001 << num_elem;
    return one_past_last - 0x0001;
}

/*
 * Mask with the lowest num_elem bits set, i.e. selecting the first num_elem
 * double lanes. total_elem is not used; it only mirrors the signature of the
 * AVX2 variant.
 */
static NPY_INLINE __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
    const int one_past_last = 0x01 << num_elem;
    return one_past_last - 0x01;
}

/*
 * Masked gather of sixteen floats from addr[vindex[i]].
 * The scale of 4 bytes means vindex counts float elements. Lanes whose
 * kmask bit is clear are taken from src.
 */
static NPY_INLINE __m512
avx512_masked_gather_ps(__m512 src,
                        npy_float* addr,
                        __m512i vindex,
                        __mmask16 kmask)
{
    const __m512 gathered = _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
    return gathered;
}

/*
 * Masked gather of eight doubles from addr[vindex[i]].
 * The scale of 8 bytes means the 32-bit indices count double elements.
 * Lanes whose kmask bit is clear are taken from src.
 */
static NPY_INLINE __m512d
avx512_masked_gather_pd(__m512d src,
                        npy_double* addr,
                        __m256i vindex,
                        __mmask8 kmask)
{
    const __m512d gathered = _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
    return gathered;
}

/*
 * Unaligned masked load of up to sixteen floats from addr; lanes whose mask
 * bit is clear are zeroed (maskz variant).
 */
static NPY_INLINE __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
    const __m512 loaded = _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
    return loaded;
}

/*
 * Unaligned masked load of up to eight doubles from addr; lanes whose mask
 * bit is clear are zeroed (maskz variant).
 */
static NPY_INLINE __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
    const __m512d loaded = _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
    return loaded;
}

/*
 * Per lane: return val where the mask bit is set, otherwise keep x.
 */
static NPY_INLINE __m512
avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
{
    const __m512 merged = _mm512_mask_blend_ps(mask, x, val);
    return merged;
}

/*
 * Per lane: return val where the mask bit is set, otherwise keep x.
 */
static NPY_INLINE __m512d
avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
{
    const __m512d merged = _mm512_mask_blend_pd(mask, x, val);
    return merged;
}

/*
 * Per lane: pick y where the ymask bit is set, otherwise pick x.
 */
static NPY_INLINE __m512
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
{
    const __m512 picked = _mm512_mask_mov_ps(x, ymask, y);
    return picked;
}

/*
 * Bitwise complement of a 16-lane mask.
 */
static NPY_INLINE __mmask16
avx512_invert_mask_ps(__mmask16 ymask)
{
    const __mmask16 flipped = _mm512_knot(ymask);
    return flipped;
}

/*
 * Bitwise complement of an 8-lane mask. The complement is computed with
 * _mm512_knot and narrowed to __mmask8 by the return type.
 */
static NPY_INLINE __mmask8
avx512_invert_mask_pd(__mmask8 ymask)
{
    const __mmask8 flipped = _mm512_knot(ymask);
    return flipped;
}

/*
 * Per-lane exponent as returned by _mm512_getexp_ps, plus one. The +1
 * matches the [0.5, 1) mantissa normalization used by
 * avx512_get_mantissa().
 */
static NPY_INLINE __m512
avx512_get_exponent(__m512 x)
{
    const __m512 raw_exp = _mm512_getexp_ps(x);
    const __m512 one = _mm512_set1_ps(1.0f);
    return _mm512_add_ps(raw_exp, one);
}

/*
 * Per-lane mantissa normalized to [0.5, 1) (_MM_MANT_NORM_p5_1), keeping the
 * sign of the source (_MM_MANT_SIGN_src).
 */
static NPY_INLINE __m512
avx512_get_mantissa(__m512 x)
{
    const __m512 mant = _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
    return mant;
}

/*
 * Scale poly by 2^quadrant per lane using the native scalef instruction.
 */
static NPY_INLINE __m512
avx512_scalef_ps(__m512 poly, __m512 quadrant)
{
    const __m512 scaled = _mm512_scalef_ps(poly, quadrant);
    return scaled;
}

/*
 * Table lookup over four vectors (t0..t3) of doubles.
 * Two two-source permutes are done with the same index, one over (t0, t1)
 * and one over (t2, t3); bit 4 (0x10) of each index then chooses between the
 * two results: clear -> (t0, t1), set -> (t2, t3).
 */
static NPY_INLINE __m512d
avx512_permute_x4var_pd(__m512d t0,
                        __m512d t1,
                        __m512d t2,
                        __m512d t3,
                        __m512i index)
{
    const __m512d from_low_pair = _mm512_permutex2var_pd(t0, index, t1);
    const __m512d from_high_pair = _mm512_permutex2var_pd(t2, index, t3);

    /* lanes whose index has bit 4 set take the (t2, t3) result */
    const __m512i bit4 = _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index);
    const __mmask8 use_high_pair = _mm512_cmp_epi64_mask(
                                     bit4, _mm512_set1_epi64(0), _MM_CMPINT_GT);
    return _mm512_mask_blend_pd(use_high_pair, from_low_pair, from_high_pair);
}

/*
 * Table lookup over eight vectors (t0..t7) of doubles.
 * Two four-vector lookups are done with the same index, one over t0..t3 and
 * one over t4..t7; bit 5 (0x20) of each index then chooses between the two
 * results: clear -> t0..t3, set -> t4..t7.
 */
static NPY_INLINE __m512d
avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
                        __m512d t4, __m512d t5, __m512d t6, __m512d t7,
                        __m512i index)
{
    const __m512d from_low_half = avx512_permute_x4var_pd(t0, t1, t2, t3, index);
    const __m512d from_high_half = avx512_permute_x4var_pd(t4, t5, t6, t7, index);

    /* lanes whose index has bit 5 set take the t4..t7 result */
    const __m512i bit5 = _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index);
    const __mmask8 use_high_half = _mm512_cmp_epi64_mask(
                                     bit5, _mm512_set1_epi64(0), _MM_CMPINT_GT);
    return _mm512_mask_blend_pd(use_high_half, from_low_half, from_high_half);
}

#endif // SIMD_AVX512F

/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
#line 395
#ifdef SIMD_AVX2_FMA3
/*
 * Vectorized Cody-Waite range reduction.
 * Conceptually performs x* = x - y*C in three steps, with C split into
 * c1 + c2 + c3 (c1, c2 exact floating points, c3 = C - c1 - c2 simulating
 * higher precision).
 *
 * Each step is a fused multiply-add that computes x + y*ci, so the
 * subtraction only happens if the constants are passed already negated
 * (presumably how they are defined in npy_simd_data.h -- confirm).
 */
static NPY_INLINE __m256
simd_range_reduction(__m256 x, __m256 y, __m256 c1, __m256 c2, __m256 c3)
{
    const __m256 after_c1 = _mm256_fmadd_ps(y, c1, x);
    const __m256 after_c2 = _mm256_fmadd_ps(y, c2, after_c1);
    return _mm256_fmadd_ps(y, c3, after_c2);
}
/*
 * Vectorized implementation of exp using AVX2 and AVX512:
 * 1) if x >= xmax; return INF (overflow)
 * 2) if x <= xmin; return 0.0f (underflow)
 * 3) Range reduction (using Cody-Waite):
 *      a) y = x - k*ln(2); k = rint(x/ln(2)); |y| <= ln(2)/2 (approximately)
 * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q
 *      b) P = 5th order and Q = 2nd order polynomials obtained from Remez's
 *      algorithm (mini-max polynomial approximation)
 * 5) Compute exp(x) = exp(y) * 2^k
 * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37)
 * 7) Max relative error measured across all 32-bit FP's= 2.1264E-07 (for the
 * same x = 0xc2781e37)
 *
 * Parameters:
 *   op          output, written contiguously (advanced by one vector per
 *               iteration)
 *   ip          input, read with a stride of `steps` bytes
 *   array_size  number of elements to process
 *   steps       input stride in bytes
 *
 * Floating point status: overflow is raised for finite x >= xmax and
 * underflow for finite x <= xmin. The exact results exp(+inf) = inf and
 * exp(-inf) = 0 raise nothing.
 */
static void
simd_exp_FLOAT(npy_float * op,
                npy_float * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
    const npy_int num_lanes = 32/(npy_intp)sizeof(npy_float);
    npy_float xmax = 88.72283935546875f;
    npy_float xmin = -103.97208404541015625f;

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */
    npy_int32 indexarr[16];
    for (npy_int32 ii = 0; ii < 16; ii++) {
        indexarr[ii] = ii*stride;
    }

    /* Load up frequently used constants */
    __m256 codyw_c1 = _mm256_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
    __m256 codyw_c2 = _mm256_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf);
    __m256 exp_p0 = _mm256_set1_ps(NPY_COEFF_P0_EXPf);
    __m256 exp_p1 = _mm256_set1_ps(NPY_COEFF_P1_EXPf);
    __m256 exp_p2 = _mm256_set1_ps(NPY_COEFF_P2_EXPf);
    __m256 exp_p3 = _mm256_set1_ps(NPY_COEFF_P3_EXPf);
    __m256 exp_p4 = _mm256_set1_ps(NPY_COEFF_P4_EXPf);
    __m256 exp_p5 = _mm256_set1_ps(NPY_COEFF_P5_EXPf);
    __m256 exp_q0 = _mm256_set1_ps(NPY_COEFF_Q0_EXPf);
    __m256 exp_q1 = _mm256_set1_ps(NPY_COEFF_Q1_EXPf);
    __m256 exp_q2 = _mm256_set1_ps(NPY_COEFF_Q2_EXPf);
    __m256 cvt_magic = _mm256_set1_ps(NPY_RINT_CVT_MAGICf);
    __m256 log2e = _mm256_set1_ps(NPY_LOG2Ef);
    __m256 inf = _mm256_set1_ps(NPY_INFINITYF);
    __m256 ninf = _mm256_set1_ps(-NPY_INFINITYF);
    __m256 zeros_f = _mm256_set1_ps(0.0f);
    __m256 poly, num_poly, denom_poly, quadrant;
    __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);

    __m256 xmax_mask, xmin_mask, nan_mask, inf_mask, ninf_mask;
    /* accumulators start with no lane selected */
    __m256 overflow_mask = fma_get_partial_load_mask_ps(0, num_lanes);
    __m256 underflow_mask = fma_get_partial_load_mask_ps(0, num_lanes);
    __m256 load_mask = fma_get_full_load_mask_ps();
    npy_intp num_remaining_elements = array_size;

    while (num_remaining_elements > 0) {

        /* tail iteration: only touch the elements that are left */
        if (num_remaining_elements < num_lanes) {
            load_mask = fma_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
        }

        __m256 x;
        if (stride == 1) {
            x = fma_masked_load_ps(load_mask, ip);
        }
        else {
            x = fma_masked_gather_ps(zeros_f, ip, vindex, load_mask);
        }

        /* NaN lanes are computed as exp(0) and patched back to NaN below */
        nan_mask = _mm256_cmp_ps(x, x, _CMP_NEQ_UQ);
        x = fma_set_masked_lanes_ps(x, zeros_f, nan_mask);

        xmax_mask = _mm256_cmp_ps(x, _mm256_set1_ps(xmax), _CMP_GE_OQ);
        xmin_mask = _mm256_cmp_ps(x, _mm256_set1_ps(xmin), _CMP_LE_OQ);
        inf_mask = _mm256_cmp_ps(x, inf, _CMP_EQ_OQ);
        ninf_mask = _mm256_cmp_ps(x, ninf, _CMP_EQ_OQ);
        /*
         * exp(+inf) = inf and exp(-inf) = 0 are exact results, so infinite
         * inputs are removed (xor) from the lanes that raise overflow or
         * underflow.
         */
        overflow_mask = _mm256_or_ps(overflow_mask,
                                    _mm256_xor_ps(xmax_mask, inf_mask));
        underflow_mask = _mm256_or_ps(underflow_mask,
                                    _mm256_xor_ps(xmin_mask, ninf_mask));

        /* out-of-range lanes are computed as exp(0) and patched below */
        x = fma_set_masked_lanes_ps(x, zeros_f, _mm256_or_ps(
                                    _mm256_or_ps(nan_mask, xmin_mask), xmax_mask));

        quadrant = _mm256_mul_ps(x, log2e);

        /* round to nearest */
        quadrant = _mm256_add_ps(quadrant, cvt_magic);
        quadrant = _mm256_sub_ps(quadrant, cvt_magic);

        /* Cody-Waite's range reduction algorithm */
        x = simd_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f);

        /* Horner evaluation of P(x) and Q(x) */
        num_poly = _mm256_fmadd_ps(exp_p5, x, exp_p4);
        num_poly = _mm256_fmadd_ps(num_poly, x, exp_p3);
        num_poly = _mm256_fmadd_ps(num_poly, x, exp_p2);
        num_poly = _mm256_fmadd_ps(num_poly, x, exp_p1);
        num_poly = _mm256_fmadd_ps(num_poly, x, exp_p0);
        denom_poly = _mm256_fmadd_ps(exp_q2, x, exp_q1);
        denom_poly = _mm256_fmadd_ps(denom_poly, x, exp_q0);
        poly = _mm256_div_ps(num_poly, denom_poly);

        /*
         * compute val = poly * 2^quadrant; which is same as adding the
         * exponent of quadrant to the exponent of poly. quadrant is an int,
         * so extracting exponent is simply extracting 8 bits.
         */
        poly = fma_scalef_ps(poly, quadrant);

        /*
         * elem >= xmax; return inf
         * elem <= xmin; return 0.0f
         * elem = +/- nan, return nan
         */
        poly = fma_set_masked_lanes_ps(poly, _mm256_set1_ps(NPY_NANF), nan_mask);
        poly = fma_set_masked_lanes_ps(poly, inf, xmax_mask);
        poly = fma_set_masked_lanes_ps(poly, zeros_f, xmin_mask);

        _mm256_maskstore_ps(op, _mm256_cvtps_epi32(load_mask), poly);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    if (_mm256_movemask_ps(overflow_mask)) {
        npy_set_floatstatus_overflow();
    }

    if (_mm256_movemask_ps(underflow_mask)) {
        npy_set_floatstatus_underflow();
    }
}

/*
 * Vectorized implementation of log using AVX2 and AVX512
 * 1) if x < 0.0f; return -NAN (invalid input)
 * 2) Range reduction: y = x/2^k;
 *      a) y = normalized mantissa, k is the exponent (0.5 <= y < 1)
 * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q
 *      b) P = 5th order and Q = 5th order polynomials obtained from Remez's
 *      algorithm (mini-max polynomial approximation)
 * 4) Compute log(x) = log(y) + k*ln(2)
 * 5) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945)
 * 6) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same
 * x = 0x3f486945)
 *
 * Parameters:
 *   op          output, written contiguously (advanced by one vector per
 *               iteration)
 *   ip          input, read with a stride of `steps` bytes
 *   array_size  number of elements to process
 *   steps       input stride in bytes
 *
 * Floating point status: invalid is raised when any x < 0, divide-by-zero
 * when any loaded x == 0.
 */

static void
simd_log_FLOAT(npy_float * op,
                npy_float * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
    const npy_int num_lanes = 32/(npy_intp)sizeof(npy_float);

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */
    npy_int32 indexarr[16];
    for (npy_int32 ii = 0; ii < 16; ii++) {
        indexarr[ii] = ii*stride;
    }

    /* Load up frequently used constants */
    __m256 log_p0 = _mm256_set1_ps(NPY_COEFF_P0_LOGf);
    __m256 log_p1 = _mm256_set1_ps(NPY_COEFF_P1_LOGf);
    __m256 log_p2 = _mm256_set1_ps(NPY_COEFF_P2_LOGf);
    __m256 log_p3 = _mm256_set1_ps(NPY_COEFF_P3_LOGf);
    __m256 log_p4 = _mm256_set1_ps(NPY_COEFF_P4_LOGf);
    __m256 log_p5 = _mm256_set1_ps(NPY_COEFF_P5_LOGf);
    __m256 log_q0 = _mm256_set1_ps(NPY_COEFF_Q0_LOGf);
    __m256 log_q1 = _mm256_set1_ps(NPY_COEFF_Q1_LOGf);
    __m256 log_q2 = _mm256_set1_ps(NPY_COEFF_Q2_LOGf);
    __m256 log_q3 = _mm256_set1_ps(NPY_COEFF_Q3_LOGf);
    __m256 log_q4 = _mm256_set1_ps(NPY_COEFF_Q4_LOGf);
    __m256 log_q5 = _mm256_set1_ps(NPY_COEFF_Q5_LOGf);
    __m256 loge2 = _mm256_set1_ps(NPY_LOGE2f);
    __m256 nan = _mm256_set1_ps(NPY_NANF);
    __m256 neg_nan = _mm256_set1_ps(-NPY_NANF);
    __m256 neg_inf = _mm256_set1_ps(-NPY_INFINITYF);
    __m256 inf = _mm256_set1_ps(NPY_INFINITYF);
    __m256 zeros_f = _mm256_set1_ps(0.0f);
    __m256 ones_f = _mm256_set1_ps(1.0f);
    __m256i vindex = _mm256_loadu_si256((__m256i*)indexarr);
    __m256 poly, num_poly, denom_poly, exponent;

    __m256 inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
    /* accumulators start with no lane selected */
    __m256 invalid_mask = fma_get_partial_load_mask_ps(0, num_lanes);
    __m256 divide_by_zero_mask = invalid_mask;
    __m256 load_mask = fma_get_full_load_mask_ps();
    npy_intp num_remaining_elements = array_size;

    while (num_remaining_elements > 0) {

        /* tail iteration: only touch the elements that are left */
        if (num_remaining_elements < num_lanes) {
            load_mask = fma_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
        }

        __m256 x_in;
        if (stride == 1) {
            x_in = fma_masked_load_ps(load_mask, ip);
        }
        else {
            x_in  = fma_masked_gather_ps(zeros_f, ip, vindex, load_mask);
        }

        negx_mask = _mm256_cmp_ps(x_in, zeros_f, _CMP_LT_OQ);
        zero_mask = _mm256_cmp_ps(x_in, zeros_f, _CMP_EQ_OQ);
        inf_mask = _mm256_cmp_ps(x_in, inf, _CMP_EQ_OQ);
        nan_mask = _mm256_cmp_ps(x_in, x_in, _CMP_NEQ_UQ);
        /*
         * zero_mask is and-ed with load_mask so that lanes beyond the end of
         * the array (which read as 0) do not raise divide-by-zero.
         */
        divide_by_zero_mask = _mm256_or_ps(divide_by_zero_mask,
                                        _mm256_and_ps(zero_mask, load_mask));
        invalid_mask = _mm256_or_ps(invalid_mask, negx_mask);

        /* negative lanes are computed on 0 and patched to -NAN below */
        __m256 x = fma_set_masked_lanes_ps(x_in, zeros_f, negx_mask);

        /* set x = normalized mantissa */
        exponent = fma_get_exponent(x);
        x = fma_get_mantissa(x);

        /* if x <= NPY_SQRT1_2f {exp = exp-1; x = 2*x} */
        sqrt2_mask = _mm256_cmp_ps(x, _mm256_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ);
        x = fma_blend(x, _mm256_add_ps(x,x), sqrt2_mask);
        exponent = fma_blend(exponent,
                               _mm256_sub_ps(exponent,ones_f), sqrt2_mask);

        /* x = x - 1 */
        x = _mm256_sub_ps(x, ones_f);

        /* Polynomial approximation for log(1+x) */
        num_poly = _mm256_fmadd_ps(log_p5, x, log_p4);
        num_poly = _mm256_fmadd_ps(num_poly, x, log_p3);
        num_poly = _mm256_fmadd_ps(num_poly, x, log_p2);
        num_poly = _mm256_fmadd_ps(num_poly, x, log_p1);
        num_poly = _mm256_fmadd_ps(num_poly, x, log_p0);
        denom_poly = _mm256_fmadd_ps(log_q5, x, log_q4);
        denom_poly = _mm256_fmadd_ps(denom_poly, x, log_q3);
        denom_poly = _mm256_fmadd_ps(denom_poly, x, log_q2);
        denom_poly = _mm256_fmadd_ps(denom_poly, x, log_q1);
        denom_poly = _mm256_fmadd_ps(denom_poly, x, log_q0);
        poly = _mm256_div_ps(num_poly, denom_poly);
        /* add back exponent * ln(2) */
        poly = _mm256_fmadd_ps(exponent, loge2, poly);

        /*
         * x = +/- NAN; return NAN
         * x < 0.0f; return -NAN
         * x = 0.0f; return -INF
         * x = +INF; return +INF
         */
        poly = fma_set_masked_lanes_ps(poly, nan, nan_mask);
        poly = fma_set_masked_lanes_ps(poly, neg_nan, negx_mask);
        poly = fma_set_masked_lanes_ps(poly, neg_inf, zero_mask);
        poly = fma_set_masked_lanes_ps(poly, inf, inf_mask);

        _mm256_maskstore_ps(op, _mm256_cvtps_epi32(load_mask), poly);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    if (_mm256_movemask_ps(invalid_mask)) {
        npy_set_floatstatus_invalid();
    }
    if (_mm256_movemask_ps(divide_by_zero_mask)) {
        npy_set_floatstatus_divbyzero();
    }
}
#endif // SIMD_AVX2_FMA3

#line 395
#ifdef SIMD_AVX512F
/*
 * Vectorized Cody-Waite range reduction.
 * Conceptually performs x* = x - y*C in three steps, with C split into
 * c1 + c2 + c3 (c1, c2 exact floating points, c3 = C - c1 - c2 simulating
 * higher precision).
 *
 * Each step is a fused multiply-add that computes x + y*ci, so the
 * subtraction only happens if the constants are passed already negated
 * (presumably how they are defined in npy_simd_data.h -- confirm).
 */
static NPY_INLINE __m512
simd_range_reduction(__m512 x, __m512 y, __m512 c1, __m512 c2, __m512 c3)
{
    const __m512 after_c1 = _mm512_fmadd_ps(y, c1, x);
    const __m512 after_c2 = _mm512_fmadd_ps(y, c2, after_c1);
    return _mm512_fmadd_ps(y, c3, after_c2);
}
/*
 * Vectorized implementation of exp using AVX2 and AVX512:
 * 1) if x >= xmax; return INF (overflow)
 * 2) if x <= xmin; return 0.0f (underflow)
 * 3) Range reduction (using Cody-Waite):
 *      a) y = x - k*ln(2); k = rint(x/ln(2)); |y| <= ln(2)/2 (approximately)
 * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q
 *      b) P = 5th order and Q = 2nd order polynomials obtained from Remez's
 *      algorithm (mini-max polynomial approximation)
 * 5) Compute exp(x) = exp(y) * 2^k
 * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37)
 * 7) Max relative error measured across all 32-bit FP's= 2.1264E-07 (for the
 * same x = 0xc2781e37)
 *
 * Parameters:
 *   op          output, written contiguously (advanced by one vector per
 *               iteration)
 *   ip          input, read with a stride of `steps` bytes
 *   array_size  number of elements to process
 *   steps       input stride in bytes
 *
 * Floating point status: overflow is raised for finite x >= xmax and
 * underflow for finite x <= xmin. The exact results exp(+inf) = inf and
 * exp(-inf) = 0 raise nothing.
 */
static void
simd_exp_FLOAT(npy_float * op,
                npy_float * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
    const npy_int num_lanes = 64/(npy_intp)sizeof(npy_float);
    npy_float xmax = 88.72283935546875f;
    npy_float xmin = -103.97208404541015625f;

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */
    npy_int32 indexarr[16];
    for (npy_int32 ii = 0; ii < 16; ii++) {
        indexarr[ii] = ii*stride;
    }

    /* Load up frequently used constants */
    __m512 codyw_c1 = _mm512_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
    __m512 codyw_c2 = _mm512_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf);
    __m512 exp_p0 = _mm512_set1_ps(NPY_COEFF_P0_EXPf);
    __m512 exp_p1 = _mm512_set1_ps(NPY_COEFF_P1_EXPf);
    __m512 exp_p2 = _mm512_set1_ps(NPY_COEFF_P2_EXPf);
    __m512 exp_p3 = _mm512_set1_ps(NPY_COEFF_P3_EXPf);
    __m512 exp_p4 = _mm512_set1_ps(NPY_COEFF_P4_EXPf);
    __m512 exp_p5 = _mm512_set1_ps(NPY_COEFF_P5_EXPf);
    __m512 exp_q0 = _mm512_set1_ps(NPY_COEFF_Q0_EXPf);
    __m512 exp_q1 = _mm512_set1_ps(NPY_COEFF_Q1_EXPf);
    __m512 exp_q2 = _mm512_set1_ps(NPY_COEFF_Q2_EXPf);
    __m512 cvt_magic = _mm512_set1_ps(NPY_RINT_CVT_MAGICf);
    __m512 log2e = _mm512_set1_ps(NPY_LOG2Ef);
    __m512 inf = _mm512_set1_ps(NPY_INFINITYF);
    __m512 ninf = _mm512_set1_ps(-NPY_INFINITYF);
    __m512 zeros_f = _mm512_set1_ps(0.0f);
    __m512 poly, num_poly, denom_poly, quadrant;
    __m512i vindex = _mm512_loadu_si512((__m512i*)&indexarr[0]);

    __mmask16 xmax_mask, xmin_mask, nan_mask, inf_mask, ninf_mask;
    /* accumulators start with no lane selected */
    __mmask16 overflow_mask = avx512_get_partial_load_mask_ps(0, num_lanes);
    __mmask16 underflow_mask = avx512_get_partial_load_mask_ps(0, num_lanes);
    __mmask16 load_mask = avx512_get_full_load_mask_ps();
    npy_intp num_remaining_elements = array_size;

    while (num_remaining_elements > 0) {

        /* tail iteration: only touch the elements that are left */
        if (num_remaining_elements < num_lanes) {
            load_mask = avx512_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
        }

        __m512 x;
        if (stride == 1) {
            x = avx512_masked_load_ps(load_mask, ip);
        }
        else {
            x = avx512_masked_gather_ps(zeros_f, ip, vindex, load_mask);
        }

        /* NaN lanes are computed as exp(0) and patched back to NaN below */
        nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_NEQ_UQ);
        x = avx512_set_masked_lanes_ps(x, zeros_f, nan_mask);

        xmax_mask = _mm512_cmp_ps_mask(x, _mm512_set1_ps(xmax), _CMP_GE_OQ);
        xmin_mask = _mm512_cmp_ps_mask(x, _mm512_set1_ps(xmin), _CMP_LE_OQ);
        inf_mask = _mm512_cmp_ps_mask(x, inf, _CMP_EQ_OQ);
        ninf_mask = _mm512_cmp_ps_mask(x, ninf, _CMP_EQ_OQ);
        /*
         * exp(+inf) = inf and exp(-inf) = 0 are exact results, so infinite
         * inputs are removed (xor) from the lanes that raise overflow or
         * underflow.
         */
        overflow_mask = _mm512_kor(overflow_mask,
                                    _mm512_kxor(xmax_mask, inf_mask));
        underflow_mask = _mm512_kor(underflow_mask,
                                    _mm512_kxor(xmin_mask, ninf_mask));

        /* out-of-range lanes are computed as exp(0) and patched below */
        x = avx512_set_masked_lanes_ps(x, zeros_f, _mm512_kor(
                                    _mm512_kor(nan_mask, xmin_mask), xmax_mask));

        quadrant = _mm512_mul_ps(x, log2e);

        /* round to nearest */
        quadrant = _mm512_add_ps(quadrant, cvt_magic);
        quadrant = _mm512_sub_ps(quadrant, cvt_magic);

        /* Cody-Waite's range reduction algorithm */
        x = simd_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f);

        /* Horner evaluation of P(x) and Q(x) */
        num_poly = _mm512_fmadd_ps(exp_p5, x, exp_p4);
        num_poly = _mm512_fmadd_ps(num_poly, x, exp_p3);
        num_poly = _mm512_fmadd_ps(num_poly, x, exp_p2);
        num_poly = _mm512_fmadd_ps(num_poly, x, exp_p1);
        num_poly = _mm512_fmadd_ps(num_poly, x, exp_p0);
        denom_poly = _mm512_fmadd_ps(exp_q2, x, exp_q1);
        denom_poly = _mm512_fmadd_ps(denom_poly, x, exp_q0);
        poly = _mm512_div_ps(num_poly, denom_poly);

        /*
         * compute val = poly * 2^quadrant; which is same as adding the
         * exponent of quadrant to the exponent of poly. quadrant is an int,
         * so extracting exponent is simply extracting 8 bits.
         */
        poly = avx512_scalef_ps(poly, quadrant);

        /*
         * elem >= xmax; return inf
         * elem <= xmin; return 0.0f
         * elem = +/- nan, return nan
         */
        poly = avx512_set_masked_lanes_ps(poly, _mm512_set1_ps(NPY_NANF), nan_mask);
        poly = avx512_set_masked_lanes_ps(poly, inf, xmax_mask);
        poly = avx512_set_masked_lanes_ps(poly, zeros_f, xmin_mask);

        _mm512_mask_storeu_ps(op, (load_mask), poly);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    if ((overflow_mask)) {
        npy_set_floatstatus_overflow();
    }

    if ((underflow_mask)) {
        npy_set_floatstatus_underflow();
    }
}

/*
 * Vectorized implementation of log using AVX2 and AVX512
 * 1) if x < 0.0f; return -NAN (invalid input)
 * 2) Range reduction: y = x/2^k;
 *      a) y = normalized mantissa, k is the exponent (0.5 <= y < 1)
 * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q
 *      b) P = 5th order and Q = 5th order polynomials obtained from Remez's
 *      algorithm (mini-max polynomial approximation)
 * 5) Compute log(x) = log(y) + k*ln(2)
 * 6) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945)
 * 7) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same
 * x = 0x3f486945)
 */

static void
simd_log_FLOAT(npy_float * op,
                npy_float * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    /*
     * op         - output buffer; advanced by num_lanes elements per
     *              iteration, i.e. written contiguously
     * ip         - input buffer; read with an element stride of
     *              steps/sizeof(npy_float)
     * array_size - number of elements to process
     * steps      - input stride in bytes
     *
     * Floating-point status is not raised per element: "invalid" and
     * "divide by zero" conditions are accumulated in lane masks and
     * reported once after the loop.
     */
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
    const npy_int num_lanes = 64/(npy_intp)sizeof(npy_float);

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */
    npy_int32 indexarr[16];
    for (npy_int32 ii = 0; ii < 16; ii++) {
        indexarr[ii] = ii*stride;
    }

    /* Load up frequently used constants */
    __m512 log_p0 = _mm512_set1_ps(NPY_COEFF_P0_LOGf);
    __m512 log_p1 = _mm512_set1_ps(NPY_COEFF_P1_LOGf);
    __m512 log_p2 = _mm512_set1_ps(NPY_COEFF_P2_LOGf);
    __m512 log_p3 = _mm512_set1_ps(NPY_COEFF_P3_LOGf);
    __m512 log_p4 = _mm512_set1_ps(NPY_COEFF_P4_LOGf);
    __m512 log_p5 = _mm512_set1_ps(NPY_COEFF_P5_LOGf);
    __m512 log_q0 = _mm512_set1_ps(NPY_COEFF_Q0_LOGf);
    __m512 log_q1 = _mm512_set1_ps(NPY_COEFF_Q1_LOGf);
    __m512 log_q2 = _mm512_set1_ps(NPY_COEFF_Q2_LOGf);
    __m512 log_q3 = _mm512_set1_ps(NPY_COEFF_Q3_LOGf);
    __m512 log_q4 = _mm512_set1_ps(NPY_COEFF_Q4_LOGf);
    __m512 log_q5 = _mm512_set1_ps(NPY_COEFF_Q5_LOGf);
    __m512 loge2 = _mm512_set1_ps(NPY_LOGE2f);
    __m512 nan = _mm512_set1_ps(NPY_NANF);
    __m512 neg_nan = _mm512_set1_ps(-NPY_NANF);
    __m512 neg_inf = _mm512_set1_ps(-NPY_INFINITYF);
    __m512 inf = _mm512_set1_ps(NPY_INFINITYF);
    __m512 zeros_f = _mm512_set1_ps(0.0f);
    __m512 ones_f = _mm512_set1_ps(1.0f);
    /* per-lane element offsets used by the strided (gather) load */
    __m512i vindex = _mm512_loadu_si512((__m512i*)indexarr);
    __m512 poly, num_poly, denom_poly, exponent;

    __mmask16 inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
    /*
     * Sticky exception accumulators.
     * NOTE(review): a partial load mask for 0 elements is presumably the
     * empty mask - confirm against avx512_get_partial_load_mask_ps.
     */
    __mmask16 invalid_mask = avx512_get_partial_load_mask_ps(0, num_lanes);
    __mmask16 divide_by_zero_mask = invalid_mask;
    __mmask16 load_mask = avx512_get_full_load_mask_ps();
    npy_intp num_remaining_elements = array_size;

    while (num_remaining_elements > 0) {

        /* last, short iteration: restrict loads/stores to the remaining lanes */
        if (num_remaining_elements < num_lanes) {
            load_mask = avx512_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
        }

        /* contiguous input uses a masked load, anything else a masked gather */
        __m512 x_in;
        if (stride == 1) {
            x_in = avx512_masked_load_ps(load_mask, ip);
        }
        else {
            x_in  = avx512_masked_gather_ps(zeros_f, ip, vindex, load_mask);
        }

        /* classify the inputs; nan_mask uses x != x, which only NaN satisfies */
        negx_mask = _mm512_cmp_ps_mask(x_in, zeros_f, _CMP_LT_OQ);
        zero_mask = _mm512_cmp_ps_mask(x_in, zeros_f, _CMP_EQ_OQ);
        inf_mask = _mm512_cmp_ps_mask(x_in, inf, _CMP_EQ_OQ);
        nan_mask = _mm512_cmp_ps_mask(x_in, x_in, _CMP_NEQ_UQ);
        /*
         * zero_mask is AND-ed with load_mask so that only active lanes can
         * report a divide-by-zero.
         * NOTE(review): this suggests inactive lanes are loaded as 0.0f -
         * confirm against the masked load/gather helpers.
         */
        divide_by_zero_mask = _mm512_kor(divide_by_zero_mask,
                                        _mm512_kand(zero_mask, load_mask));
        invalid_mask = _mm512_kor(invalid_mask, negx_mask);

        /* negative lanes are replaced by 0.0f; their result is overwritten below */
        __m512 x = avx512_set_masked_lanes_ps(x_in, zeros_f, negx_mask);

        /* set x = normalized mantissa */
        exponent = avx512_get_exponent(x);
        x = avx512_get_mantissa(x);

        /* if x <= 1/sqrt(2) (NPY_SQRT1_2f) {exp = exp-1; x = 2*x} */
        sqrt2_mask = _mm512_cmp_ps_mask(x, _mm512_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ);
        x = avx512_blend(x, _mm512_add_ps(x,x), sqrt2_mask);
        exponent = avx512_blend(exponent,
                               _mm512_sub_ps(exponent,ones_f), sqrt2_mask);

        /* x = x - 1 */
        x = _mm512_sub_ps(x, ones_f);

        /*
         * Polynomial approximation for log(1+x): numerator and denominator
         * are each evaluated with Horner's scheme, one FMA per coefficient.
         */
        num_poly = _mm512_fmadd_ps(log_p5, x, log_p4);
        num_poly = _mm512_fmadd_ps(num_poly, x, log_p3);
        num_poly = _mm512_fmadd_ps(num_poly, x, log_p2);
        num_poly = _mm512_fmadd_ps(num_poly, x, log_p1);
        num_poly = _mm512_fmadd_ps(num_poly, x, log_p0);
        denom_poly = _mm512_fmadd_ps(log_q5, x, log_q4);
        denom_poly = _mm512_fmadd_ps(denom_poly, x, log_q3);
        denom_poly = _mm512_fmadd_ps(denom_poly, x, log_q2);
        denom_poly = _mm512_fmadd_ps(denom_poly, x, log_q1);
        denom_poly = _mm512_fmadd_ps(denom_poly, x, log_q0);
        poly = _mm512_div_ps(num_poly, denom_poly);
        /* add back the range reduction: result = exponent*ln(2) + P/Q */
        poly = _mm512_fmadd_ps(exponent, loge2, poly);

        /*
         * x < 0.0f; return -NAN
         * x = +/- NAN; return NAN
         * x = 0.0f; return -INF
         * x = +INF; return +INF
         */
        poly = avx512_set_masked_lanes_ps(poly, nan, nan_mask);
        poly = avx512_set_masked_lanes_ps(poly, neg_nan, negx_mask);
        poly = avx512_set_masked_lanes_ps(poly, neg_inf, zero_mask);
        poly = avx512_set_masked_lanes_ps(poly, inf, inf_mask);

        _mm512_mask_storeu_ps(op, (load_mask), poly);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    /* report the accumulated exceptions once for the whole array */
    if ((invalid_mask)) {
        npy_set_floatstatus_invalid();
    }
    if ((divide_by_zero_mask)) {
        npy_set_floatstatus_divbyzero();
    }
}
#endif // SIMD_AVX512F


#ifdef SIMD_AVX512F_NOCLANG_BUG
/*
 * Vectorized implementation of exp double using AVX512
 * Reference: Tang, P.T.P., "Table-driven implementation of the
 *  exponential function in IEEE floating-point
 *  arithmetic," ACM Transactions on Mathematical
 *  Software, vol. 15, pp. 144-157, 1989.
 * 1) if x > mTH_max or x is INF; return INF (overflow)
 * 2) if x < mTH_min; return 0.0f (underflow)
 * 3) if abs(x) < mTH_nearzero; return 1.0f + x
 * 4) if x is Nan; return Nan
 * 5) Range reduction:
 *    x = (32m + j)ln2 / 32 + r; r in [-ln2/64, ln2/64]
 * 6) exp(r) - 1 is approximated by a polynomial function p(r)
 *    exp(x) = 2^m(2^(j/32) + 2^(j/32)p(r));
 */
static void
AVX512F_exp_DOUBLE(npy_double * op,
                npy_double * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    /*
     * op         - output buffer; advanced by num_lanes elements per
     *              iteration, i.e. written contiguously
     * ip         - input buffer; read with an element stride of
     *              steps/sizeof(npy_double)
     * array_size - number of elements to process
     * steps      - input stride in bytes
     *
     * Overflow and underflow are accumulated in lane masks and reported
     * once after the loop. Infinite inputs are excluded from both: exp(+inf)
     * is exactly +inf and exp(-inf) is exactly 0, so neither is an
     * overflow/underflow.
     */
    npy_intp num_remaining_elements = array_size;
    const npy_intp stride = steps / (npy_intp)sizeof(npy_double);
    const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double);
    /* per-lane element offsets used by the strided (gather) load */
    npy_int32 indexarr[8];
    for (npy_int32 ii = 0; ii < 8; ii++) {
        indexarr[ii] = ii*stride;
    }

    __m512d InvLn2N = _mm512_set1_pd(NPY_INV_LN2_MUL_32);
    __m512d mShift = _mm512_set1_pd(NPY_RINT_CVT_MAGIC);
    __m512d mNegL1 = _mm512_set1_pd(NPY_TANG_NEG_L1);
    __m512d mNegL2 = _mm512_set1_pd(NPY_TANG_NEG_L2);
    __m512i mMod = _mm512_set1_epi64(0x1f);
    __m512d mA1 = _mm512_set1_pd(NPY_TANG_A1);
    __m512d mA2 = _mm512_set1_pd(NPY_TANG_A2);
    __m512d mA3 = _mm512_set1_pd(NPY_TANG_A3);
    __m512d mA4 = _mm512_set1_pd(NPY_TANG_A4);
    __m512d mA5 = _mm512_set1_pd(NPY_TANG_A5);
    __m512d mTH_nearzero = _mm512_set1_pd(0x1p-54);
    __m512d mTH_max = _mm512_set1_pd(0x1.62e42fefa39efp+9);
    __m512d mTH_min = _mm512_set1_pd(-0x1.74910d52d3053p+9);
    __m512d mTH_inf = _mm512_set1_pd(NPY_INFINITY);
    __m512d mTH_ninf = _mm512_set1_pd(-NPY_INFINITY);
    __m512d zeros_d = _mm512_set1_pd(0.0f);
    __m512d ones_d = _mm512_set1_pd(1.0f);
    __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);

    /* 32-entry 2^(j/32) tables, split into four 8-lane registers each */
    __m512d mTable_top_0 = _mm512_loadu_pd(&(EXP_Table_top[8*0]));
    __m512d mTable_top_1 = _mm512_loadu_pd(&(EXP_Table_top[8*1]));
    __m512d mTable_top_2 = _mm512_loadu_pd(&(EXP_Table_top[8*2]));
    __m512d mTable_top_3 = _mm512_loadu_pd(&(EXP_Table_top[8*3]));
    __m512d mTable_tail_0 = _mm512_loadu_pd(&(EXP_Table_tail[8*0]));
    __m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1]));
    __m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2]));
    __m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3]));

    /* sticky exception accumulators, reported after the loop */
    __mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
    __mmask8 underflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
    __mmask8 load_mask = avx512_get_full_load_mask_pd();
    __mmask8 xmin_mask, xmax_mask, inf_mask, ninf_mask, nan_mask, nearzero_mask;

    while (num_remaining_elements > 0) {
        /* last, short iteration: restrict loads/stores to the remaining lanes */
        if (num_remaining_elements < num_lanes) {
            load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
                                                      num_lanes);
        }

        __m512d x;
        if (1 == stride) {
            x = avx512_masked_load_pd(load_mask, ip);
        }
        else {
            x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
        }

        /* NaN lanes are zeroed first so they cannot trip the range checks */
        nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ);
        x = avx512_set_masked_lanes_pd(x, zeros_d, nan_mask);
        xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ);
        xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ);
        inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ);
        ninf_mask = _mm512_cmp_pd_mask(x, mTH_ninf, _CMP_EQ_OQ);
        __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x),
                                _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
        nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs),
                                    mTH_nearzero, _CMP_LT_OQ);
        /* zeroed NaN lanes look "near zero"; take them back out */
        nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask);
        /*
         * +inf is a subset of xmax_mask and -inf a subset of xmin_mask, so
         * the XOR removes exactly the infinite lanes: only finite
         * out-of-range inputs overflow or underflow.
         */
        overflow_mask = _mm512_kor(overflow_mask,
                                _mm512_kxor(xmax_mask, inf_mask));
        underflow_mask = _mm512_kor(underflow_mask,
                                _mm512_kxor(xmin_mask, ninf_mask));
        /* special lanes run through the main path as 0.0 and are patched below */
        x = avx512_set_masked_lanes_pd(x, zeros_d,
                        _mm512_kor(_mm512_kor(nan_mask, xmin_mask),
                            _mm512_kor(xmax_mask, nearzero_mask)));

        /* z = x * 32/ln2 */
        __m512d z = _mm512_mul_pd(x, InvLn2N);

        /*
         * round to nearest: adding and subtracting mShift rounds z to an
         * integer in kd; the raw bits of the intermediate sum are kept in ki
         */
        __m512d kd = _mm512_add_pd(z, mShift);
        __m512i ki = _mm512_castpd_si512(kd);
        kd = _mm512_sub_pd(kd, mShift);

        /* r = (x + kd*mNegL1) + kd*mNegL2 */
        __m512d r1 = _mm512_fmadd_pd(kd, mNegL1, x);
        __m512d r2 = _mm512_mul_pd(kd, mNegL2);
        __m512d r = _mm512_add_pd(r1,r2);

        /* Polynomial approximation for exp(r) - 1 */
        __m512d q = _mm512_fmadd_pd(mA5, r, mA4);
        q = _mm512_fmadd_pd(q, r, mA3);
        q = _mm512_fmadd_pd(q, r, mA2);
        q = _mm512_fmadd_pd(q, r, mA1);
        q = _mm512_mul_pd(q, r);
        __m512d p = _mm512_fmadd_pd(r, q, r2);
        p = _mm512_add_pd(r1, p);

        /* Get 2^(j/32) from lookup table; j is the low 5 bits of ki */
        __m512i j = _mm512_and_epi64(ki, mMod);
        __m512d top = avx512_permute_x4var_pd(mTable_top_0, mTable_top_1,
                                  mTable_top_2, mTable_top_3, j);
        __m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1,
                                  mTable_tail_2, mTable_tail_3, j);

        /*
         * s = top + tail;
         * exp(x) = 2^m * (top + (tail + s * p));
         */
        __m512d s = _mm512_add_pd(top, tail);
        __m512d res = _mm512_fmadd_pd(s, p, tail);
        res = _mm512_add_pd(res, top);
        res = _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32)));

        /*
         * return special cases; x is 0.0 on near-zero lanes at this point,
         * so x + 1 yields 1.0 there
         */
        res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d),
                                        nearzero_mask);
        res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN),
                                        nan_mask);
        res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask);
        res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask);

        _mm512_mask_storeu_pd(op, load_mask, res);

        ip += num_lanes * stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    /* report the accumulated exceptions once for the whole array */
    if (overflow_mask) {
        npy_set_floatstatus_overflow();
    }

    if (underflow_mask) {
        npy_set_floatstatus_underflow();
    }
}
/*
 * Vectorized implementation of log double using AVX512
 * Reference:
 * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions
 *     and their error analysis. No. CONF-9106103-1. Argonne National Lab.,
 *     IL (USA), 1991.
 * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm
 *     function in IEEE floating-point arithmetic." ACM Transactions on
 *     Mathematical Software (TOMS) 16.4 (1990): 378-400.
 * [3] Muller, Jean-Michel. "Elementary functions: algorithms and
 *     implementation." (2016).
 * 1) if x = 0; return -INF
 * 2) if x < 0; return NAN
 * 3) if x is INF; return INF
 * 4) if x is NAN; return NAN
 * 5) if x on (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), calling npy_log()
 * 6) Range reduction:
 *    log(x) = log(2^m * z)
 *           = mln2 + log(z)
 * 7) log(z) = log(z / c_k) + log(c_k);
 *    where c_k = 1 + k/64, k = 0,1,...,64
 *    s.t. |x - c_k| <= 1/128 when x on[1,2].
 * 8) r = 2(x - c_k)/(x + c_k)
 *    log(x/c_k) = log((1 + r/2) / (1 - r/2))
 *               = p(r)
 *               = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...)
 */
static void
AVX512F_log_DOUBLE(npy_double * op,
                npy_double * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    /*
     * op         - output buffer; advanced by num_lanes elements per
     *              iteration, i.e. written contiguously
     * ip         - input buffer; read with an element stride of
     *              steps/sizeof(npy_double)
     * array_size - number of elements to process
     * steps      - input stride in bytes
     *
     * "invalid" and "divide by zero" conditions found by the vector path
     * are accumulated in lane masks and reported once after the loop.
     * Lanes close to 1.0 are recomputed one by one with npy_log().
     */
    npy_intp num_remaining_elements = array_size;
    const npy_intp stride = steps / (npy_intp)sizeof(npy_double);
    const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double);
    /* per-lane element offsets used by the strided (gather) load */
    npy_int32 indexarr[8];
    for (npy_int32 ii = 0; ii < 8; ii++) {
        indexarr[ii] = ii*stride;
    }

    __m512d zeros_d = _mm512_set1_pd(0.0f);
    __m512d ones_d = _mm512_set1_pd(1.0f);
    __m512d mInf = _mm512_set1_pd(NPY_INFINITY);
    /* 0x3f90000000000000 is the bit pattern of 2^-6 = 1/64 */
    __m512d mInv64 = _mm512_castsi512_pd(_mm512_set1_epi64(0x3f90000000000000));
    __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN);
    __m512d mNan = _mm512_set1_pd(NPY_NAN);
    __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY);
    __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1);
    __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2);
    __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3);
    __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4);
    __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI);
    __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO);

    /* bounds of the open interval around 1.0 that is handed to npy_log() */
    __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4);
    __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4);
    __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);

    /* Load lookup table data: 64 entries each, as eight 8-lane registers */
    #line 907

    __m512d mLUT_TOP_0 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*0]));
    __m512d mLUT_TAIL_0 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*0]));

    
#line 907

    __m512d mLUT_TOP_1 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*1]));
    __m512d mLUT_TAIL_1 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*1]));

    
#line 907

    __m512d mLUT_TOP_2 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*2]));
    __m512d mLUT_TAIL_2 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*2]));

    
#line 907

    __m512d mLUT_TOP_3 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*3]));
    __m512d mLUT_TAIL_3 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*3]));

    
#line 907

    __m512d mLUT_TOP_4 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*4]));
    __m512d mLUT_TAIL_4 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*4]));

    
#line 907

    __m512d mLUT_TOP_5 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*5]));
    __m512d mLUT_TAIL_5 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*5]));

    
#line 907

    __m512d mLUT_TOP_6 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*6]));
    __m512d mLUT_TAIL_6 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*6]));

    
#line 907

    __m512d mLUT_TOP_7 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*7]));
    __m512d mLUT_TAIL_7 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*7]));

    

    __mmask8 load_mask = avx512_get_full_load_mask_pd();
    /*
     * Sticky exception accumulators.
     * NOTE(review): a partial load mask for 0 elements is presumably the
     * empty mask - confirm against avx512_get_partial_load_mask_pd.
     */
    __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
    __mmask8 divide_by_zero_mask = invalid_mask;

    __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask,
             glibc_mask;

    __m512d x_in;
    while (num_remaining_elements > 0) {
        /* last, short iteration: restrict loads/stores to the remaining lanes */
        if (num_remaining_elements < num_lanes) {
            load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
                                                      num_lanes);
        }

        if (1 == stride) {
            x_in = avx512_masked_load_pd(load_mask, ip);
        }
        else {
            x_in = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
        }

        /*
         * lanes with x strictly inside (1.0 - 0x1p-4, 1.0 + 0x1.09p-4) are
         * recomputed with npy_log() after the vector path
         */
        __mmask8 m1 = _mm512_cmp_pd_mask(x_in, mTo_glibc_max, _CMP_LT_OQ);
        __mmask8 m2 = _mm512_cmp_pd_mask(x_in, mTo_glibc_min, _CMP_GT_OQ);
        glibc_mask =  m1 & m2;

        /* the vector path is skipped only when all 8 lanes take the fallback */
        if (glibc_mask != 0xFF) {
            zero_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_EQ_OQ);
            inf_mask = _mm512_cmp_pd_mask(x_in, mInf, _CMP_EQ_OQ);
            negx_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_LT_OQ);
            /* x != x is only true for NaN */
            nan_mask = _mm512_cmp_pd_mask(x_in, x_in, _CMP_NEQ_UQ);

            /* only active lanes may report a divide-by-zero */
            divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask);
            invalid_mask = invalid_mask | negx_mask;

            /* negative lanes are replaced by 0.0; their result is overwritten below */
            __m512d x = avx512_set_masked_lanes_pd(x_in, zeros_d, negx_mask);
            __m512i ix = _mm512_castpd_si512(x);

            /*
             * Normalize x when it is denormal: a non-zero value whose sign
             * and exponent bits (top 12) are all zero is multiplied by 2^52,
             * and 52 is then subtracted from its exponent field.
             * NOTE(review): _CMP_EQ_OQ is a floating-point predicate name
             * passed to an integer compare; it presumably has the same
             * value as _MM_CMPINT_EQ - confirm.
             */
            __m512i top12 = _mm512_and_epi64(ix,
                                _mm512_set1_epi64(0xfff0000000000000));
            denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0),
                                _CMP_EQ_OQ);
            denormal_mask = (~zero_mask) & denormal_mask;
            ix = _mm512_castpd_si512(_mm512_mask_mul_pd(x, denormal_mask,
                                    x, _mm512_set1_pd(0x1p52)));
            ix = _mm512_mask_sub_epi64(ix, denormal_mask,
                                    ix, _mm512_set1_epi64(52ULL << 52));

            /*
             * x = 2^k * z; where z in range [1,2]
             *
             * tmp = ix with the exponent bias (0x3ff << 52) removed
             * i   = 6 bits of tmp starting at bit 46, the table index
             * ik  = tmp >> 52 (arithmetic), the exponent k
             * z   = ix with the sign/exponent bits of tmp subtracted
             */
            __m512i tmp = _mm512_sub_epi64(ix,
                              _mm512_set1_epi64(0x3ff0000000000000));
            __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6),
                            _mm512_set1_epi64(0x3fULL));
            __m512i ik = _mm512_srai_epi64(tmp, 52);
            __m512d z = _mm512_castsi512_pd(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp,
                            _mm512_set1_epi64(0xfff0000000000000))));
            /* c = i/64 + 1 */
            __m256i i_32 = _mm512_cvtepi64_epi32(i);
            __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d);

            /* u = 2 * (z - c) / (z + c) */
            __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c));
            u = _mm512_mul_pd(_mm512_set1_pd(2.0), u);

            /* v = u * u */
            __m512d v = _mm512_mul_pd(u,u);

            /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */
            __m512d res = _mm512_fmadd_pd(v, mA4, mA3);
            res = _mm512_fmadd_pd(v, res, mA2);
            res = _mm512_fmadd_pd(v, res, mA1);
            res = _mm512_mul_pd(v, res);
            res = _mm512_fmadd_pd(u, res, u);

            /* Look up the table entries selected by i (high and low parts) */
            __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1,
                            mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5,
                            mLUT_TOP_6, mLUT_TOP_7, i);
            __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1,
                              mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5,
                              mLUT_TAIL_6, mLUT_TAIL_7, i);

            /*
             * log(x) = k * ln2_hi + c_hi +
             *          k * ln2_lo + c_lo +
             *          log(z/c)
             */
            __m256i ik_32 = _mm512_cvtepi64_epi32(ik);
            __m512d k = _mm512_cvtepi32_pd(ik_32);
            __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi);
            __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo);
            tt = _mm512_add_pd(tt, tt2);
            res = _mm512_add_pd(tt, res);

            /*
             * return special cases:
             * x is NAN; return NAN
             * x < 0; return -NAN
             * x = 0; return -INF
             * x = +INF; return +INF
             */
            res = avx512_set_masked_lanes_pd(res, mNan, nan_mask);
            res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask);
            res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask);
            res = avx512_set_masked_lanes_pd(res, mInf, inf_mask);

            /* stores every active lane; fallback lanes are overwritten below */
            _mm512_mask_storeu_pd(op, load_mask, res);
        }

        /*
         * scalar fallback: spill the inputs and call npy_log() for each lane
         * whose bit is set in glibc_mask (the mask is consumed by shifting)
         */
        if (glibc_mask != 0) {
            double NPY_DECL_ALIGNED(64) ip_fback[8];
            _mm512_store_pd(ip_fback, x_in);

            for (int ii = 0; ii < 8; ++ii, glibc_mask >>= 1) {
                if (glibc_mask & 0x01) {
                    op[ii] = npy_log(ip_fback[ii]);
                }
            }
        }
        ip += num_lanes * stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    /* report the exceptions accumulated by the vector path */
    if (invalid_mask) {
        npy_set_floatstatus_invalid();
    }
    if (divide_by_zero_mask) {
        npy_set_floatstatus_divbyzero();
    }
}
#endif // AVX512F_NOCLANG_BUG

#ifdef SIMD_AVX512_SKX
#line 1063
/*
 * AVX512_SKX inner loop for ldexp on npy_float, 16 elements per iteration:
 * each output is the first input scaled by 2 raised to the (int) second
 * input, computed with _mm512_scalef_ps.
 *
 * args       - args[0]: npy_float values, args[1]: int exponents,
 *              args[2]: npy_float results
 * dimensions - dimensions[0] is the number of elements
 * steps      - byte strides of args[0], args[1] and args[2]
 *
 * An operand with an element stride of 1 uses masked contiguous
 * loads/stores; any other stride goes through gather/scatter.
 */
static NPY_INLINE void
AVX512_SKX_ldexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    npy_float *val_ptr = (npy_float *)args[0];
    int *exp_ptr = (int *)args[1];
    npy_float *res_ptr = (npy_float *)args[2];

    /* byte steps expressed as element strides */
    const npy_intp val_stride = steps[0]/(npy_intp)sizeof(npy_float);
    const npy_intp exp_stride = steps[1]/(npy_intp)sizeof(int);
    const npy_intp res_stride = steps[2]/(npy_intp)sizeof(npy_float);

    /*
     * Per-lane element offsets for gather/scatter. Indices are generally
     * npy_intp, but the caller guarantees through
     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP that the largest one fits in
     * an int32.
     */
    npy_int32 val_off[16], exp_off[16], res_off[16];
    for (npy_int32 lane = 0; lane < 16; lane++) {
        val_off[lane] = lane*val_stride;
        exp_off[lane] = lane*exp_stride;
        res_off[lane] = lane*res_stride;
    }
    __m512i val_idx = _mm512_loadu_si512((__m512i*)&val_off[0]);
    __m512i exp_idx = _mm512_loadu_si512((__m512i*)&exp_off[0]);
    __m512i res_idx = _mm512_loadu_si512((__m512i*)&res_off[0]);

    __m512 fill_f = _mm512_setzero_ps();
    __m512i fill_i = _mm512_setzero_epi32();
    __mmask16 active = avx512_get_full_load_mask_ps();

    for (npy_intp todo = dimensions[0]; todo > 0; todo -= 16) {
        /* short final iteration: keep only the lanes that are left */
        if (todo < 16) {
            active = avx512_get_partial_load_mask_ps(todo, 16);
        }

        __m512 val;
        if (val_stride == 1) {
            val = avx512_masked_load_ps(active, val_ptr);
        }
        else {
            val = avx512_masked_gather_ps(fill_f, val_ptr, val_idx, active);
        }

        __m512i expo;
        if (exp_stride == 1) {
            expo = _mm512_maskz_loadu_epi32(active, exp_ptr);
        }
        else {
            expo = _mm512_mask_i32gather_epi32(fill_i, active, exp_idx, exp_ptr, 4);
        }

        /* scalef takes the exponent as a float vector */
        __m512 expo_f = _mm512_cvtepi32_ps(expo);
        __m512 scaled = _mm512_scalef_ps(val, expo_f);

        if (res_stride == 1) {
            _mm512_mask_storeu_ps(res_ptr, active, scaled);
        }
        else {
            _mm512_mask_i32scatter_ps(res_ptr, active, res_idx, scaled, 4);
        }

        val_ptr += 16*val_stride;
        exp_ptr += 16*exp_stride;
        res_ptr += 16*res_stride;
    }
}

/*
 * AVX512_SKX inner loop for frexp on npy_float, 16 elements per iteration:
 * splits each input into a mantissa (first output) and an int exponent
 * (second output).
 *
 * args       - args[0]: npy_float input, args[1]: npy_float mantissas,
 *              args[2]: int exponents
 * dimensions - dimensions[0] is the number of elements
 * steps      - byte strides of args[0], args[1] and args[2]
 *
 * An operand with an element stride of 1 uses masked contiguous
 * loads/stores; any other stride goes through gather/scatter.
 */
static NPY_INLINE void
AVX512_SKX_frexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    npy_float *src_ptr = (npy_float *)args[0];
    npy_float *mant_ptr = (npy_float *)args[1];
    int *exp_ptr = (int *)args[2];

    /* byte steps expressed as element strides */
    const npy_intp src_stride = steps[0]/(npy_intp)sizeof(npy_float);
    const npy_intp mant_stride = steps[1]/(npy_intp)sizeof(npy_float);
    const npy_intp exp_stride = steps[2]/(npy_intp)sizeof(int);

    /*
     * Per-lane element offsets for gather/scatter. Indices are generally
     * npy_intp, but the caller guarantees through
     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP that the largest one fits in
     * an int32.
     */
    npy_int32 src_off[16], mant_off[16], exp_off[16];
    for (npy_int32 lane = 0; lane < 16; lane++) {
        src_off[lane] = lane*src_stride;
        mant_off[lane] = lane*mant_stride;
        exp_off[lane] = lane*exp_stride;
    }
    __m512i src_idx = _mm512_loadu_si512((__m512i*)&src_off[0]);
    __m512i mant_idx = _mm512_loadu_si512((__m512i*)&mant_off[0]);
    __m512i exp_idx = _mm512_loadu_si512((__m512i*)&exp_off[0]);

    __m512 fill_f = _mm512_setzero_ps();
    __mmask16 active = avx512_get_full_load_mask_ps();

    for (npy_intp todo = dimensions[0]; todo > 0; todo -= 16) {
        /* short final iteration: keep only the lanes that are left */
        if (todo < 16) {
            active = avx512_get_partial_load_mask_ps(todo, 16);
        }

        __m512 src;
        if (src_stride == 1) {
            src = avx512_masked_load_ps(active, src_ptr);
        }
        else {
            src = avx512_masked_gather_ps(fill_f, src_ptr, src_idx, active);
        }

        /*
         * The getmant/getexp instructions do not match NumPy's frexp
         * output for NAN, +/-INF and +/-0.0. Those lanes are left out of
         * "regular" so the instructions never see them (which also avoids
         * invalid exceptions); they return the input itself as mantissa
         * and 0 as exponent.
         */
        __mmask16 special = _mm512_fpclass_ps_mask(src, 0b10011111);
        __mmask16 regular = _mm512_knot(special);

        __m512 mant = _mm512_maskz_getmant_ps(
                            regular, src, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
        mant = _mm512_mask_mov_ps(src, regular, mant);

        /* exponent = getexp(x) + 1, matching the [0.5, 1) mantissa range */
        __m512 raw_exp = _mm512_maskz_getexp_ps(regular, src);
        __m512 adj_exp = _mm512_maskz_add_ps(regular, _mm512_set1_ps(1.0), raw_exp);
        __m512i expo = _mm512_cvtps_epi32(adj_exp);

        if (mant_stride == 1) {
            _mm512_mask_storeu_ps(mant_ptr, active, mant);
        }
        else {
            _mm512_mask_i32scatter_ps(mant_ptr, active, mant_idx, mant, 4);
        }

        if (exp_stride == 1) {
            _mm512_mask_storeu_epi32(exp_ptr, active, expo);
        }
        else {
            _mm512_mask_i32scatter_epi32(exp_ptr, active, exp_idx, expo, 4);
        }

        src_ptr += 16*src_stride;
        mant_ptr += 16*mant_stride;
        exp_ptr += 16*exp_stride;
    }
}

#line 1063
/*
 * AVX512_SKX inner loop for ldexp on npy_double, 8 elements per iteration:
 * each output is the first input scaled by 2 raised to the (int) second
 * input, computed with _mm512_scalef_pd.
 *
 * args       - args[0]: npy_double values, args[1]: int exponents,
 *              args[2]: npy_double results
 * dimensions - dimensions[0] is the number of elements
 * steps      - byte strides of args[0], args[1] and args[2]
 *
 * An operand with an element stride of 1 uses masked contiguous
 * loads/stores; any other stride goes through gather/scatter.
 */
static NPY_INLINE void
AVX512_SKX_ldexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    npy_double *val_ptr = (npy_double *)args[0];
    int *exp_ptr = (int *)args[1];
    npy_double *res_ptr = (npy_double *)args[2];

    /* byte steps expressed as element strides */
    const npy_intp val_stride = steps[0]/(npy_intp)sizeof(npy_double);
    const npy_intp exp_stride = steps[1]/(npy_intp)sizeof(int);
    const npy_intp res_stride = steps[2]/(npy_intp)sizeof(npy_double);

    /*
     * Per-lane element offsets for gather/scatter. Indices are generally
     * npy_intp, but the caller guarantees through
     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP that the largest one fits in
     * an int32.
     */
    npy_int32 val_off[8], exp_off[8], res_off[8];
    for (npy_int32 lane = 0; lane < 8; lane++) {
        val_off[lane] = lane*val_stride;
        exp_off[lane] = lane*exp_stride;
        res_off[lane] = lane*res_stride;
    }
    __m256i val_idx = _mm256_loadu_si256((__m256i*)&val_off[0]);
    __m256i exp_idx = _mm256_loadu_si256((__m256i*)&exp_off[0]);
    __m256i res_idx = _mm256_loadu_si256((__m256i*)&res_off[0]);

    __m512d fill_d = _mm512_setzero_pd();
    __m256i fill_i = _mm256_setzero_si256();
    __mmask8 active = avx512_get_full_load_mask_pd();

    for (npy_intp todo = dimensions[0]; todo > 0; todo -= 8) {
        /* short final iteration: keep only the lanes that are left */
        if (todo < 8) {
            active = avx512_get_partial_load_mask_pd(todo, 8);
        }

        __m512d val;
        if (val_stride == 1) {
            val = avx512_masked_load_pd(active, val_ptr);
        }
        else {
            val = avx512_masked_gather_pd(fill_d, val_ptr, val_idx, active);
        }

        /* the eight int exponents occupy a 256-bit vector */
        __m256i expo;
        if (exp_stride == 1) {
            expo = _mm256_maskz_loadu_epi32(active, exp_ptr);
        }
        else {
            expo = _mm256_mmask_i32gather_epi32(fill_i, active, exp_idx, exp_ptr, 4);
        }

        /* scalef takes the exponent as a double vector */
        __m512d expo_d = _mm512_cvtepi32_pd(expo);
        __m512d scaled = _mm512_scalef_pd(val, expo_d);

        if (res_stride == 1) {
            _mm512_mask_storeu_pd(res_ptr, active, scaled);
        }
        else {
            _mm512_mask_i32scatter_pd(res_ptr, active, res_idx, scaled, 8);
        }

        val_ptr += 8*val_stride;
        exp_ptr += 8*exp_stride;
        res_ptr += 8*res_stride;
    }
}

/*
 * AVX512_SKX inner loop for frexp on npy_double, 8 elements per iteration:
 * splits each input into a mantissa (first output) and an int exponent
 * (second output).
 *
 * args       - args[0]: npy_double input, args[1]: npy_double mantissas,
 *              args[2]: int exponents
 * dimensions - dimensions[0] is the number of elements
 * steps      - byte strides of args[0], args[1] and args[2]
 *
 * An operand with an element stride of 1 uses masked contiguous
 * loads/stores; any other stride goes through gather/scatter.
 */
static NPY_INLINE void
AVX512_SKX_frexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    npy_double *src_ptr = (npy_double *)args[0];
    npy_double *mant_ptr = (npy_double *)args[1];
    int *exp_ptr = (int *)args[2];

    /* byte steps expressed as element strides */
    const npy_intp src_stride = steps[0]/(npy_intp)sizeof(npy_double);
    const npy_intp mant_stride = steps[1]/(npy_intp)sizeof(npy_double);
    const npy_intp exp_stride = steps[2]/(npy_intp)sizeof(int);

    /*
     * Per-lane element offsets for gather/scatter. Indices are generally
     * npy_intp, but the caller guarantees through
     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP that the largest one fits in
     * an int32.
     */
    npy_int32 src_off[8], mant_off[8], exp_off[8];
    for (npy_int32 lane = 0; lane < 8; lane++) {
        src_off[lane] = lane*src_stride;
        mant_off[lane] = lane*mant_stride;
        exp_off[lane] = lane*exp_stride;
    }
    __m256i src_idx = _mm256_loadu_si256((__m256i*)&src_off[0]);
    __m256i mant_idx = _mm256_loadu_si256((__m256i*)&mant_off[0]);
    __m256i exp_idx = _mm256_loadu_si256((__m256i*)&exp_off[0]);

    __m512d fill_d = _mm512_setzero_pd();
    __mmask8 active = avx512_get_full_load_mask_pd();

    for (npy_intp todo = dimensions[0]; todo > 0; todo -= 8) {
        /* short final iteration: keep only the lanes that are left */
        if (todo < 8) {
            active = avx512_get_partial_load_mask_pd(todo, 8);
        }

        __m512d src;
        if (src_stride == 1) {
            src = avx512_masked_load_pd(active, src_ptr);
        }
        else {
            src = avx512_masked_gather_pd(fill_d, src_ptr, src_idx, active);
        }

        /*
         * The getmant/getexp instructions do not match NumPy's frexp
         * output for NAN, +/-INF and +/-0.0. Those lanes are left out of
         * "regular" so the instructions never see them (which also avoids
         * invalid exceptions); they return the input itself as mantissa
         * and 0 as exponent.
         */
        __mmask8 special = _mm512_fpclass_pd_mask(src, 0b10011111);
        __mmask8 regular = _mm512_knot(special);

        __m512d mant = _mm512_maskz_getmant_pd(
                            regular, src, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
        mant = _mm512_mask_mov_pd(src, regular, mant);

        /* exponent = getexp(x) + 1, matching the [0.5, 1) mantissa range */
        __m512d raw_exp = _mm512_maskz_getexp_pd(regular, src);
        __m512d adj_exp = _mm512_maskz_add_pd(regular, _mm512_set1_pd(1.0), raw_exp);
        __m256i expo = _mm512_cvtpd_epi32(adj_exp);

        if (mant_stride == 1) {
            _mm512_mask_storeu_pd(mant_ptr, active, mant);
        }
        else {
            _mm512_mask_i32scatter_pd(mant_ptr, active, mant_idx, mant, 8);
        }

        /* the eight int exponents occupy a 256-bit vector */
        if (exp_stride == 1) {
            _mm256_mask_storeu_epi32(exp_ptr, active, expo);
        }
        else {
            _mm256_mask_i32scatter_epi32(exp_ptr, active, exp_idx, expo, 4);
        }

        src_ptr += 8*src_stride;
        mant_ptr += 8*mant_stride;
        exp_ptr += 8*exp_stride;
    }
}

#endif // SIMD_AVX512_SKX


/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
#line 1219
/*
 * ufunc inner loop for exp on npy_float.
 *
 * args[0]/steps[0] describe the input, args[1]/steps[1] the output and
 * dimensions[0] the element count. With a SIMD target enabled the work is
 * done by simd_exp_FLOAT; otherwise each element goes through npy_expf.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_exp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if defined(SIMD_AVX2_FMA3) || defined(SIMD_AVX512F)
    /*
     * Fast path: hand the whole array to the SIMD kernel in one call.
     * The third argument of `IS_OUTPUT_BLOCKABLE_UNARY` is a dummy.
     * TODO: get rid of this macro during the move to NPYV
     */
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), 64)) {
        simd_exp_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
        return;
    }

    /*
     * Slow path: the SIMD kernel copes with strided input but not with
     * strided output, so it is invoked once per element. Going through the
     * same kernel (rather than a scalar libm call) keeps the results of the
     * strided and non-strided cases identical.
     */
    UNARY_LOOP {
        simd_exp_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]);
    }
#else
    /* no SIMD target: plain scalar loop */
    UNARY_LOOP {
        *(npy_float *)op1 = npy_expf(*(npy_float *)ip1);
    }
#endif
}

#line 1219
/*
 * ufunc inner loop: log for npy_float.
 *
 * args[0] is read as the input and args[1] is written as the output;
 * dimensions[0] is the element count and steps[0] the input step.
 * When a SIMD kernel is compiled in (AVX2+FMA3 or AVX512F) every element is
 * computed by simd_log_FLOAT; otherwise npy_logf is applied per element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if defined(SIMD_AVX2_FMA3) || defined(SIMD_AVX512F)
    // The third argument of `IS_OUTPUT_BLOCKABLE_UNARY` is a dummy value.
    // TODO: get rid of this macro during the move to NPYV
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), 64)) {
        npy_float *dst = (npy_float *)args[1];
        npy_float *src = (npy_float *)args[0];
        simd_log_FLOAT(dst, src, dimensions[0], steps[0]);
        return;
    }
    /*
     * The SIMD kernel copes with a strided input but not with a strided
     * output, so here it is invoked one element at a time. Going through
     * the same kernel (instead of a scalar libm call) keeps the results of
     * the strided and non-strided paths identical.
     */
    UNARY_LOOP {
        simd_log_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]);
    }
#else
    UNARY_LOOP {
        *(npy_float *)op1 = npy_logf(*(npy_float *)ip1);
    }
#endif
}


#line 1252
/*
 * ufunc inner loop: exp for npy_double.
 *
 * If SIMD_AVX512F_NOCLANG_BUG is defined and IS_OUTPUT_BLOCKABLE_UNARY
 * accepts the layout, the whole array is handed to AVX512F_exp_DOUBLE.
 * In every other case npy_exp is applied element by element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_exp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if defined(SIMD_AVX512F_NOCLANG_BUG)
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) {
        npy_double *dst = (npy_double *)args[1];
        npy_double *src = (npy_double *)args[0];
        AVX512F_exp_DOUBLE(dst, src, dimensions[0], steps[0]);
        return;
    }
#endif
    /* Scalar fallback. */
    UNARY_LOOP {
        *(npy_double *)op1 = npy_exp(*(npy_double *)ip1);
    }
}

#line 1252
/*
 * ufunc inner loop: log for npy_double.
 *
 * If SIMD_AVX512F_NOCLANG_BUG is defined and IS_OUTPUT_BLOCKABLE_UNARY
 * accepts the layout, the whole array is handed to AVX512F_log_DOUBLE.
 * In every other case npy_log is applied element by element.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if defined(SIMD_AVX512F_NOCLANG_BUG)
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) {
        npy_double *dst = (npy_double *)args[1];
        npy_double *src = (npy_double *)args[0];
        AVX512F_log_DOUBLE(dst, src, dimensions[0], steps[0]);
        return;
    }
#endif
    /* Scalar fallback. */
    UNARY_LOOP {
        *(npy_double *)op1 = npy_log(*(npy_double *)ip1);
    }
}


#line 1275
/*
 * ufunc inner loop: frexp for npy_float (one input, two outputs).
 *
 * For each element, the value returned by npy_frexpf is stored to the first
 * output and the exponent is written through the second output as an int.
 * With SIMD_AVX512_SKX defined, the loop is delegated to
 * AVX512_SKX_frexp_FLOAT whenever
 * IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP holds.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_frexp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(SIMD_AVX512_SKX)
    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
        AVX512_SKX_frexp_FLOAT(args, dimensions, steps);
        return;
    }
#endif
    /* Scalar fallback. */
    UNARY_LOOP_TWO_OUT {
        /* Read the input before anything is written. */
        const npy_float value = *(npy_float *)ip1;
        int *exponent_dst = (int *)op2;
        npy_float *fraction_dst = (npy_float *)op1;
        *fraction_dst = npy_frexpf(value, exponent_dst);
    }
}

/*
 * ufunc inner loop: ldexp for npy_float (two inputs, one output).
 *
 * The first input is read as npy_float and the second as int; the result of
 * npy_ldexpf on the pair is stored to the output.
 * With SIMD_AVX512_SKX defined, the loop is delegated to
 * AVX512_SKX_ldexp_FLOAT whenever IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
 * holds.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_ldexp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(SIMD_AVX512_SKX)
    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
        AVX512_SKX_ldexp_FLOAT(args, dimensions, steps);
        return;
    }
#endif
    /* Scalar fallback. */
    BINARY_LOOP {
        /* Both operands are read before the output is written. */
        const npy_float significand = *(npy_float *)ip1;
        const int exponent = *(int *)ip2;
        npy_float *result_dst = (npy_float *)op1;
        *result_dst = npy_ldexpf(significand, exponent);
    }
}

#line 1275
/*
 * ufunc inner loop: frexp for npy_double (one input, two outputs).
 *
 * For each element, the value returned by npy_frexp is stored to the first
 * output and the exponent is written through the second output as an int.
 * With SIMD_AVX512_SKX defined, the loop is delegated to
 * AVX512_SKX_frexp_DOUBLE whenever
 * IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP holds.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_frexp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(SIMD_AVX512_SKX)
    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
        AVX512_SKX_frexp_DOUBLE(args, dimensions, steps);
        return;
    }
#endif
    /* Scalar fallback. */
    UNARY_LOOP_TWO_OUT {
        /* Read the input before anything is written. */
        const npy_double value = *(npy_double *)ip1;
        int *exponent_dst = (int *)op2;
        npy_double *fraction_dst = (npy_double *)op1;
        *fraction_dst = npy_frexp(value, exponent_dst);
    }
}

/*
 * ufunc inner loop: ldexp for npy_double (two inputs, one output).
 *
 * The first input is read as npy_double and the second as int; the result
 * of npy_ldexp on the pair is stored to the output.
 * With SIMD_AVX512_SKX defined, the loop is delegated to
 * AVX512_SKX_ldexp_DOUBLE whenever IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
 * holds.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_ldexp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(SIMD_AVX512_SKX)
    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
        AVX512_SKX_ldexp_DOUBLE(args, dimensions, steps);
        return;
    }
#endif
    /* Scalar fallback. */
    BINARY_LOOP {
        /* Both operands are read before the output is written. */
        const npy_double significand = *(npy_double *)ip1;
        const int exponent = *(int *)ip2;
        npy_double *result_dst = (npy_double *)op1;
        *result_dst = npy_ldexp(significand, exponent);
    }
}


