/* mpn_mulmid -- middle product

   Contributed by David Harvey.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"


#define CHUNK (200 + MULMID_TOOM42_THRESHOLD)


void
mpn_mulmid (mp_ptr rp,
            mp_srcptr ap, mp_size_t an,
            mp_srcptr bp, mp_size_t bn)
{
  mp_size_t rn, k;
  mp_ptr scratch, temp;

  ASSERT (an >= bn);
  ASSERT (bn >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an));
  ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn));

  if (bn < MULMID_TOOM42_THRESHOLD)
    {
      /* region not tall enough to make toom42 worthwhile for any portion */

      if (an < CHUNK)
	{
	  /* region not too wide either, just call basecase directly */
	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
	  return;
	}

      /* Region quite wide. For better locality, use basecase on chunks:

	 AAABBBCC..
	 .AAABBBCC.
	 ..AAABBBCC
      */

      k = CHUNK - bn + 1;    /* number of diagonals per chunk */

      /* first chunk (marked A in the above diagram) */
      mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);

      /* remaining chunks (B, C, etc) */
      an -= k;

      while (an >= CHUNK)
	{
	  mp_limb_t t0, t1, cy;
	  ap += k, rp += k;
	  t0 = rp[0], t1 = rp[1];
	  mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
	  ADDC_LIMB (cy, rp[0], rp[0], t0);    /* add back saved limbs */
	  MPN_INCR_U (rp + 1, k + 1, t1 + cy);
	  an -= k;
	}

      if (an >= bn)
	{
	  /* last remaining chunk */
	  mp_limb_t t0, t1, cy;
	  ap += k, rp += k;
	  t0 = rp[0], t1 = rp[1];
	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
	  ADDC_LIMB (cy, rp[0], rp[0], t0);
	  MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy);
	}

      return;
    }

  /* region is tall enough for toom42 */

  rn = an - bn + 1;

  if (rn < MULMID_TOOM42_THRESHOLD)
    {
      /* region not wide enough to make toom42 worthwhile for any portion */

      TMP_DECL;

      if (bn < CHUNK)
	{
	  /* region not too tall either, just call basecase directly */
	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
	  return;
	}

      /* Region quite tall. For better locality, use basecase on chunks:

	 AAAAA....
	 .AAAAA...
	 ..BBBBB..
	 ...BBBBB.
	 ....CCCCC
      */

      TMP_MARK;

      temp = TMP_ALLOC_LIMBS (rn + 2);

      /* first chunk (marked A in the above diagram) */
      bp += bn - CHUNK, an -= bn - CHUNK;
      mpn_mulmid_basecase (rp, ap, an, bp, CHUNK);

      /* remaining chunks (B, C, etc) */
      bn -= CHUNK;

      while (bn >= CHUNK)
	{
	  ap += CHUNK, bp -= CHUNK;
	  mpn_mulmid_basecase (temp, ap, an, bp, CHUNK);
	  mpn_add_n (rp, rp, temp, rn + 2);
	  bn -= CHUNK;
	}

      if (bn)
	{
	  /* last remaining chunk */
	  ap += CHUNK, bp -= bn;
	  mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn);
	  mpn_add_n (rp, rp, temp, rn + 2);
	}

      TMP_FREE;
      return;
    }

  /* we're definitely going to use toom42 somewhere */

  if (bn > rn)
    {
      /* slice region into chunks, use toom42 on all chunks except possibly
	 the last:

         AA....
         .AA...
         ..BB..
         ...BB.
         ....CC
      */

      TMP_DECL;
      TMP_MARK;

      temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn));
      scratch = temp + rn + 2;

      /* first chunk (marked A in the above diagram) */
      bp += bn - rn;
      mpn_toom42_mulmid (rp, ap, bp, rn, scratch);

      /* remaining chunks (B, C, etc) */
      bn -= rn;

      while (bn >= rn)
        {
          ap += rn, bp -= rn;
	  mpn_toom42_mulmid (temp, ap, bp, rn, scratch);
          mpn_add_n (rp, rp, temp, rn + 2);
          bn -= rn;
        }

      if (bn)
        {
          /* last remaining chunk */
          ap += rn, bp -= bn;
	  mpn_mulmid (temp, ap, rn + bn - 1, bp, bn);
          mpn_add_n (rp, rp, temp, rn + 2);
        }

      TMP_FREE;
    }
  else
    {
      /* slice region into chunks, use toom42 on all chunks except possibly
	 the last:

         AAABBBCC..
         .AAABBBCC.
         ..AAABBBCC
      */

      TMP_DECL;
      TMP_MARK;

      scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn));

      /* first chunk (marked A in the above diagram) */
      mpn_toom42_mulmid (rp, ap, bp, bn, scratch);

      /* remaining chunks (B, C, etc) */
      rn -= bn;

      while (rn >= bn)
        {
	  mp_limb_t t0, t1, cy;
          ap += bn, rp += bn;
          t0 = rp[0], t1 = rp[1];
          mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
	  ADDC_LIMB (cy, rp[0], rp[0], t0);     /* add back saved limbs */
	  MPN_INCR_U (rp + 1, bn + 1, t1 + cy);
	  rn -= bn;
        }

      TMP_FREE;

      if (rn)
        {
          /* last remaining chunk */
	  mp_limb_t t0, t1, cy;
          ap += bn, rp += bn;
          t0 = rp[0], t1 = rp[1];
          mpn_mulmid (rp, ap, rn + bn - 1, bp, bn);
	  ADDC_LIMB (cy, rp[0], rp[0], t0);
	  MPN_INCR_U (rp + 1, rn + 1, t1 + cy);
        }
    }
}
