convolve.c 9.93 KB
Newer Older
1 2 3 4
/* Karatsuba convolution
 *
 *  Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz>
 *
5 6 7 8
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
9
 *
10 11 12 13
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
14
 *
15 16
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
Tim-Philipp Müller's avatar
Tim-Philipp Müller committed
17 18
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
19
 *
20 21
 * 
 * Note: 7th December 2004: This file used to be licensed under the GPL,
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
22
 *       but we got permission from Ralp Loader to relicense it to LGPL.
23
 * 
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
 *  $Id$
 *
 */

/* The algorithm is based on the following.  For the convolution of a pair
 * of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four
 * multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c -
 * b.d.  A similar relation enables us to compute a 2n by 2n convolution
 * using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n
 * multiplications (as opposed to the 4^n that the quadratic algorithm
 * takes. */

/* For large n, this is slower than the O(n log n) that the FFT method
 * takes, but we avoid using complex numbers, and we only have to compute
 * one convolution, as opposed to 3 FFTs.  We have good locality-of-
 * reference as well, which will help on CPUs with tiny caches.  */

/* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160
 * (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba
 * algorithm.  We actually want 257 outputs of a 256 x 512 convolution;
 * that doesn't appear to give an easy advantage for the FFT algorithm, but
 * for the Karatsuba algorithm, it's easy to use two 256 x 256
 * convolutions, taking 2 x 3^8 = 12312 multiplications.  [This difference
 * is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n,
 * while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n
 * - 1]. */

/* There's a big lie above, actually... for a 4x4 convolution, it's quicker
 * to do it using 16 multiplications than the more complex Karatsuba
 * algorithm...  So the recursion bottoms out at 4x4s.  This increases the
 * number of multiplications by a factor of 16/9, but reduces the overheads
 * dramatically. */

/* The convolution algorithm is implemented as a stack machine.  We have a
 * stack of commands, each in one of the forms "do a 2^n x 2^n
 * convolution", or "combine these three length 2^n outputs into one
 * 2^{n+1} output." */

62 63 64 65
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

66 67 68
#include <stdlib.h>
#include "convolve.h"

Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
69 70 71 72 73 74
typedef union stack_entry_s
{
  struct
  {
    const double *left, *right;
    double *out;
75 76
  }
  v;
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
77 78 79
  struct
  {
    double *main, *null;
80 81
  }
  b;
82

83 84
}
stack_entry;
85 86 87

#define STACK_SIZE (CONVOLVE_DEPTH * 3)

Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
88 89 90 91 92
struct _struct_convolve_state
{
  double left[CONVOLVE_BIG];
  double right[CONVOLVE_SMALL * 3];
  double scratch[CONVOLVE_SMALL * 3];
93
  stack_entry stack[STACK_SIZE + 1];
94 95 96 97 98 99 100 101
};

/*
 * Initialisation routine - sets up tables and space to work in.
 * Returns a pointer to internal state, to be used when performing calls.
 * On error, returns NULL.
 * The pointer should be freed when it is finished with, by convolve_close().
 */
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
102 103
convolve_state *
convolve_init (void)
104
{
105
  return (convolve_state *) calloc (1, sizeof (convolve_state));
106 107 108 109 110
}

/*
 * Free the state allocated with convolve_init().
 */
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
111 112
void
convolve_close (convolve_state * state)
113
{
114
  free (state);
115 116
}

Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
117 118
static void
convolve_4 (double *out, const double *left, const double *right)
119 120 121
/* This does a 4x4 -> 7 convolution.  For what it's worth, the slightly odd
 * ordering gives about a 1% speed up on my Pentium II. */
{
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
  double l0, l1, l2, l3, r0, r1, r2, r3;
  double a;

  l0 = left[0];
  r0 = right[0];
  a = l0 * r0;
  l1 = left[1];
  r1 = right[1];
  out[0] = a;
  a = (l0 * r1) + (l1 * r0);
  l2 = left[2];
  r2 = right[2];
  out[1] = a;
  a = (l0 * r2) + (l1 * r1) + (l2 * r0);
  l3 = left[3];
  r3 = right[3];
  out[2] = a;

  out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0);
  out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1);
  out[5] = (l2 * r3) + (l3 * r2);
  out[6] = l3 * r3;
144 145
}

Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
146 147
static void
convolve_run (stack_entry * top, unsigned size, double *scratch)
148 149 150 151 152 153 154
/* Interpret a stack of commands.  The stack starts with two entries; the
 * convolution to do, and an illegal entry used to mark the stack top.  The
 * size is the number of entries in each input, and must be a power of 2,
 * and at least 8.  It is OK to have out equal to left and/or right.
 * scratch must have length 3*size.  The number of stack entries needed is
 * 3n-4 where size=2^n. */
{
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
  do {
    const double *left;
    const double *right;
    double *out;

    /* When we get here, the stack top is always a convolve,
     * with size > 4.  So we will split it.  We repeatedly split
     * the top entry until we get to size = 4. */

    left = top->v.left;
    right = top->v.right;
    out = top->v.out;
    top++;

    do {
      double *s_left, *s_right;
      int i;

      /* Halve the size. */
      size >>= 1;

      /* Allocate the scratch areas. */
      s_left = scratch + size * 3;
      /* s_right is a length 2*size buffer also used for
       * intermediate output. */
      s_right = scratch + size * 4;

      /* Create the intermediate factors. */
      for (i = 0; i < size; i++) {
184 185
        double l = left[i] + left[i + size];
        double r = right[i] + right[i + size];
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
186

187 188
        s_left[i + size] = r;
        s_left[i] = l;
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
      }

      /* Push the combine entry onto the stack. */
      top -= 3;
      top[2].b.main = out;
      top[2].b.null = NULL;

      /* Push the low entry onto the stack.  This must be
       * the last of the three sub-convolutions, because
       * it may overwrite the arguments. */
      top[1].v.left = left;
      top[1].v.right = right;
      top[1].v.out = out;

      /* Push the mid entry onto the stack. */
      top[0].v.left = s_left;
      top[0].v.right = s_right;
      top[0].v.out = s_right;

      /* Leave the high entry in variables. */
      left += size;
      right += size;
      out += size * 2;

    } while (size > 4);

    /* When we get here, the stack top is a group of 3
     * convolves, with size = 4, followed by some combines.  */
    convolve_4 (out, left, right);
    convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right);
    convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right);
    top += 2;

    /* Now process combines. */
    do {
      /* b.main is the output buffer, mid is the middle
       * part which needs to be adjusted in place, and
       * then folded back into the output.  We do this in
       * a slightly strange way, so as to avoid having
       * two loops. */
      double *out = top->b.main;
      double *mid = scratch + size * 4;
      unsigned int i;

      top++;
      out[size * 2 - 1] = 0;
      for (i = 0; i < size - 1; i++) {
236 237 238 239 240 241 242 243 244
        double lo;
        double hi;

        lo = mid[0] - (out[0] + out[2 * size]) + out[size];
        hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size];
        out[size] = lo;
        out[2 * size] = hi;
        out++;
        mid++;
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
245 246 247 248
      }
      size <<= 1;
    } while (top->b.null == NULL);
  } while (top->b.main != NULL);
249 250
}

Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
251 252 253
int
convolve_match (const int *lastchoice,
    const short *input, convolve_state * state)
254 255 256 257 258 259 260 261
/* lastchoice is a 256 sized array.  input is a 512 array.  We find the
 * contiguous length 256 sub-array of input that best matches lastchoice.
 * A measure of how good a sub-array is compared with the lastchoice is
 * given by the sum of the products of each pair of entries.  We maximise
 * that, by taking an appropriate convolution, and then finding the maximum
 * entry in the convolutions.  state is a (non-NULL) pointer returned by
 * convolve_init.  */
{
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
262 263 264 265 266 267 268
  double avg;
  double best;
  int p = 0;
  int i;
  double *left = state->left;
  double *right = state->right;
  double *scratch = state->scratch;
269
  stack_entry *top = state->stack + (STACK_SIZE - 1);
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
270

271
#if 1
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
272 273 274 275 276 277 278 279 280 281
  for (i = 0; i < 512; i++)
    left[i] = input[i];

  avg = 0;
  for (i = 0; i < 256; i++) {
    double a = lastchoice[255 - i];

    right[i] = a;
    avg += a;
  }
282
#endif
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319
  /* We adjust the smaller of the two input arrays to have average
   * value 0.  This makes the eventual result insensitive to both
   * constant offsets and positive multipliers of the inputs. */
  avg /= 256;
  for (i = 0; i < 256; i++)
    right[i] -= avg;
  /* End-of-stack marker. */
  top[1].b.null = scratch;
  top[1].b.main = NULL;
  /* The low 256x256, of which we want the high 256 outputs. */
  top->v.left = left;
  top->v.right = right;
  top->v.out = right + 256;
  convolve_run (top, 256, scratch);

  /* The high 256x256, of which we want the low 256 outputs. */
  top->v.left = left + 256;
  top->v.right = right;
  top->v.out = right;
  convolve_run (top, 256, scratch);

  /* Now find the best position amoungs this.  Apart from the first
   * and last, the required convolution outputs are formed by adding
   * outputs from the two convolutions above. */
  best = right[511];
  right[767] = 0;
  p = -1;
  for (i = 0; i < 256; i++) {
    double a = right[i] + right[i + 512];

    if (a > best) {
      best = a;
      p = i;
    }
  }
  p++;

320
#if 0
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
321 322 323 324 325 326 327 328 329 330 331 332 333
  {
    /* This is some debugging code... */
    int bad = 0;

    best = 0;
    for (i = 0; i < 256; i++)
      best += ((double) input[i + p]) * ((double) lastchoice[i] - avg);

    for (i = 0; i < 257; i++) {
      double tot = 0;
      unsigned int j;

      for (j = 0; j < 256; j++)
334
        tot += ((double) input[i + j]) * ((double) lastchoice[j] - avg);
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
335
      if (tot > best)
336
        printf ("(%i)", i);
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
337
      if (tot != left[i + 255])
338
        printf ("!");
Thomas Vander Stichele's avatar
Thomas Vander Stichele committed
339 340 341 342 343 344 345
    }

    printf ("%i\n", p);
  }
#endif

  return p;
346
}