/*---------\
 | VECTOR.C |
/+----------+---------------------------------\
|                                             |
| Designed and Implimented by Adam Freidin    |
|                                             |
| This vector library provides a              |
|  fairly straitforward set of routines for   |
|  most linear functions.                     |
|                                             |
| Namespacing is similar to the OpenGL API    |
|                                             |
| Notes on usage:                             |
| Src and dst may be the same                 |
|  for all functions, but cases when they     |
|  overlap partially are undefined.           | 
|                                             |
| This library is designed to be a core       |
|  library.                                   |
| For an example of a layer on                |
|  top of vector.c see geom.c,                |
|  a geometry library.                        |
|                                             |
\--------------------------------------------*/

/*--------------------------------------\
| Disclaimer:
|   This code is certified by me to work, and work well,
|   Since I have no credentials as yet,
|    (perhaps other than this code here)
|   my certification means nothing whatsoever.
|
| Simply stated: USE AT YOUR OWN RISK
|
| You have permision to use this library free
|  free programs free of charge. 
| However for anything commercial, I get 
|  a copy of the product.
\---------------------------------------*/

#include<math.h>

#include<stdlib.h>

#include<string.h>

#include<stdarg.h>


#define VCTR_BUILD_DLL

#include"vector.h"


#if VECTOR_MT_SAFE

/* Multithreading safety, all tmp arrays get allocated on the stack */
# define VECTOR_USES_TMP(x) x;

#else

/* For single threading, all tmp arrays are statically allocated, (424 bytes of tmp) */
# define VECTOR_USES_TMP(x)


static double _tmp4d[4]; /* general tmp arrays */
static double _tmp4d2[4]; /* secondary tmp arrays */
static double _crtmp4d[4]; /* cross product tmp arrays */
static float * const _tmp4f   = (float * const)_tmp4d;
static float * const _tmp4f2  = (float * const)_tmp4d2;
static float * const _crtmp4f = (float * const)_crtmp4d;

static double _qtmpd[4];
static double _qtmpd2[4];
static double _qtmpd3[4]; /* quat tmp arrays */
static float * const _qtmpf  = (float *)_qtmpd;
static float * const _qtmpf2 = (float *)_qtmpd2;
static float * const _qtmpf3 = (float *)_qtmpd3;

static double _tmp2x2d[4];
static double _tmp3x3d[9]; /* specific sized matrix tmp arrays */
static float * const _tmp2x2f = (float * const)_tmp2x2d;
static float * const _tmp3x3f = (float * const)_tmp3x3d;

static double _mtmpd[16]; /* generalized matrix tmp arrays */
static float * const _mtmpf = (float * const)_mtmpd;

#endif


/* runtime check of VECTOR_MT_SAFE compiletime constant */
VCTR_R(int) rtVECTOR_MT_SAFE() 
{
  return VECTOR_MT_SAFE;
}

VCTR_R(float*)add2f(float dst[2], const float src[2], const float src2[2])
{
  dst[0] = src[0] + src2[0];
  dst[1] = src[1] + src2[1];
  return dst;
}

VCTR_R(double*)add2d(double dst[2], const double src[2], const double src2[2])
{
  dst[0] = src[0] + src2[0];
  dst[1] = src[1] + src2[1];
  return dst;
}

VCTR_R(float*)sub2f(float dst[2], const float src[2], const float src2[2])
{
  dst[0] = src[0] - src2[0];
  dst[1] = src[1] - src2[1];
  return dst;
}

VCTR_R(double*)sub2d(double dst[2], const double src[2], const double src2[2])
{
  dst[0] = src[0] - src2[0];
  dst[1] = src[1] - src2[1];
  return dst;
}

VCTR_R(float*)mul2f(float dst[2], const float src[2], const float src2)
{
  dst[0] = src[0] * src2;
  dst[1] = src[1] * src2;
  return dst;
}

VCTR_R(double*)mul2d(double dst[2], const double src[2], const double src2)
{
  dst[0] = src[0] * src2;
  dst[1] = src[1] * src2;
  return dst;
}

VCTR_R(float*)mad2f(float dst[2], const float src[2], const float src2[2], float mul)
{
  dst[0] = src[0] + src2[0]*mul;
  dst[1] = src[1] + src2[1]*mul;
  return dst;
}

VCTR_R(double*)mad2d(double dst[2], const double src[2], const double src2[2], double mul)
{
  dst[0] = src[0] + src2[0]*mul;
  dst[1] = src[1] + src2[1]*mul;
  return dst;
}

VCTR_R(float*)neg2f(float d[2], const float s[2])
{
  d[0] = -s[0];
  d[1] = -s[1];
  return d;
}

VCTR_R(double*)neg2d(double d[2], const double s[2])
{
  d[0] = -s[0];
  d[1] = -s[1];
  return d;
}

VCTR_R(float*)neg3f(float d[3], const float s[3])
{
  d[0] = -s[0];
  d[1] = -s[1];
  d[2] = -s[2];
  return d;
}

VCTR_R(double*)neg3d(double d[3], const double s[3])
{
  d[0] = -s[0];
  d[1] = -s[1];
  d[2] = -s[2];
  return d;
}

VCTR_R(float*)neg4f(float d[4], const float s[4])
{
  d[0] = -s[0];
  d[1] = -s[1];
  d[2] = -s[2];
  d[3] = -s[3];
  return d;
}

VCTR_R(double*)neg4d(double d[4], const double s[4])
{
  d[0] = -s[0];
  d[1] = -s[1];
  d[2] = -s[2];
  d[3] = -s[3];
  return d;
}

VCTR_R(float*)set2f(float v[2], float a, float b)
{
  v[0] = a;
  v[1] = b;
  return v;
}

VCTR_R(double*)set2d(double v[2], double a, double b)
{
  v[0] = a;
  v[1] = b;
  return v;
}

VCTR_R(float*)set3f(float v[3], float a, float b, float c)
{
  v[0] = a;
  v[1] = b;
  v[2] = c;
  return v;
}

VCTR_R(double*)set3d(double v[3], double a, double b, double c)
{
  v[0] = a;
  v[1] = b;
  v[2] = c;
  return v;
}

VCTR_R(float*)set4f(float v[4], float a, float b, float c, float d)
{
  v[0] = a;
  v[1] = b;
  v[2] = c;
  v[3] = d;
  return v;
}

VCTR_R(double*)set4d(double v[4], double a, double b, double c, double d)
{
  v[0] = a;
  v[1] = b;
  v[2] = c;
  v[3] = d;
  return v;
}

VCTR_R(float) dot2f(const float src[2], const float src2[2])
{
  return src[0] * src2[0] + 
         src[1] * src2[1];
}

VCTR_R(double)dot2d(const double src[2], const double src2[2])
{
  return src[0] * src2[0] + 
         src[1] * src2[1];
}

VCTR_R(float) dot3f(const float src[3], const float src2[3])
{
  return src[0] * src2[0] + 
         src[1] * src2[1] + 
         src[2] * src2[2];
}

VCTR_R(double)dot3d(const double src[3], const double src2[3])
{
  return src[0] * src2[0] + 
         src[1] * src2[1] + 
         src[2] * src2[2];
}

VCTR_R(float*)add3f(float dst[3], const float src[3], const float src2[3])
{
  dst[0] = src[0] + src2[0];
  dst[1] = src[1] + src2[1];
  dst[2] = src[2] + src2[2];
  return dst;
}

VCTR_R(double*)add3d(double dst[3], const double src[3], const double src2[3])
{
  dst[0] = src[0] + src2[0];
  dst[1] = src[1] + src2[1];
  dst[2] = src[2] + src2[2];
  return dst;
}

VCTR_R(float*)sub3f(float dst[3], const float src[3], const float src2[3])
{
  dst[0] = src[0] - src2[0];
  dst[1] = src[1] - src2[1];
  dst[2] = src[2] - src2[2];
  return dst;
}

VCTR_R(double*)sub3d(double dst[3], const double src[3], const double src2[3])
{
  dst[0] = src[0] - src2[0];
  dst[1] = src[1] - src2[1];
  dst[2] = src[2] - src2[2];
  return dst;
}

VCTR_R(float*)mul3f(float dst[3], const float src[3], const float src2)
{
  dst[0] = src[0] * src2;
  dst[1] = src[1] * src2;
  dst[2] = src[2] * src2;
  return dst;
}

VCTR_R(double*)mul3d(double dst[3], const double src[3], const double src2)
{
  dst[0] = src[0] * src2;
  dst[1] = src[1] * src2;
  dst[2] = src[2] * src2;
  return dst;
}

VCTR_R(float*)mad3f(float dst[3], const float src[3], const float src2[3], float mul)
{
  dst[0] = src[0] + src2[0]*mul;
  dst[1] = src[1] + src2[1]*mul;
  dst[2] = src[2] + src2[2]*mul;
  return dst;
}

VCTR_R(double*)mad3d(double dst[3], const double src[3], const double src2[3], double mul)
{
  dst[0] = src[0] + src2[0]*mul;
  dst[1] = src[1] + src2[1]*mul;
  dst[2] = src[2] + src2[2]*mul;
  return dst;
}

VCTR_R(float*)crossf(float dst[3], const float src[3], const float src2[3])
{
  VECTOR_USES_TMP(float _crtmp4f[4])
  _crtmp4f[0] = src[1] * src2[2] - src[2] * src2[1];
  _crtmp4f[1] = src[2] * src2[0] - src[0] * src2[2];
  _crtmp4f[2] = src[0] * src2[1] - src[1] * src2[0];
  return cpy3f(dst, _crtmp4f);
}

VCTR_R(double*)crossd(double dst[3], const double src[3], const double src2[3])
{
  VECTOR_USES_TMP(double _crtmp4d[4])
  _crtmp4d[0] = src[1] * src2[2] - src[2] * src2[1];
  _crtmp4d[1] = src[2] * src2[0] - src[0] * src2[2];
  _crtmp4d[2] = src[0] * src2[1] - src[1] * src2[0];
  return cpy3d(dst, _crtmp4d);
}

VCTR_R(float*)add4f(float dst[4], const float src[4], const float src2[4])
{
  dst[0] = src[0] + src2[0];
  dst[1] = src[1] + src2[1];
  dst[2] = src[2] + src2[2];
  dst[3] = src[3] + src2[3];
  return dst;
}

VCTR_R(double*)add4d(double dst[4], const double src[4], const double src2[4])
{
  dst[0] = src[0] + src2[0];
  dst[1] = src[1] + src2[1];
  dst[2] = src[2] + src2[2];
  dst[3] = src[3] + src2[3];
  return dst;
}

VCTR_R(float*)sub4f(float dst[4], const float src[4], const float src2[4])
{
  dst[0] = src[0] - src2[0];
  dst[1] = src[1] - src2[1];
  dst[2] = src[2] - src2[2];
  dst[3] = src[3] - src2[3];
  return dst;
}

VCTR_R(double*)sub4d(double dst[4], const double src[4], const double src2[4])
{
  dst[0] = src[0] - src2[0];
  dst[1] = src[1] - src2[1];
  dst[2] = src[2] - src2[2];
  dst[3] = src[3] - src2[3];
  return dst;
}

VCTR_R(double*)mul4d(double dst[4], const double src[4], const double src2)
{
  dst[0] = src[0] * src2;
  dst[1] = src[1] * src2;
  dst[2] = src[2] * src2;
  dst[3] = src[3] * src2;
  return dst;
}

VCTR_R(float*)mul4f(float dst[4], const float src[4], const float src2)
{
  dst[0] = src[0] * src2;
  dst[1] = src[1] * src2;
  dst[2] = src[2] * src2;
  dst[3] = src[3] * src2;
  return dst;
}

VCTR_R(float*)mad4f(float dst[4], const float src[4], const float src2[4], float mul)
{
  dst[0] = src[0] + src2[0]*mul;
  dst[1] = src[1] + src2[1]*mul;
  dst[2] = src[2] + src2[2]*mul;
  dst[3] = src[3] + src2[3]*mul;
  return dst;
}

VCTR_R(double*)mad4d(double dst[4], const double src[4], const double src2[4], double mul)
{
  dst[0] = src[0] + src2[0]*mul;
  dst[1] = src[1] + src2[1]*mul;
  dst[2] = src[2] + src2[2]*mul;
  dst[3] = src[3] + src2[3]*mul;
  return dst;
}

VCTR_R(float) dot4f(const float src[4], const float src2[4])
{
  return src[0] * src2[0] +
         src[1] * src2[1] + 
         src[2] * src2[2] +
         src[3] * src2[3];
}

VCTR_R(double)dot4d(const double src[4], const double src2[4])
{
  return src[0] * src2[0] +
         src[1] * src2[1] + 
         src[2] * src2[2] +
         src[3] * src2[3];
}

/*----------------------------------------------\
| qdot({x,y,z,k},{a,b,c,s}) =                   |
|    qdot({V,k},{U,s})                          | 
| --> {cross(V,U)+sV+kU, sk - dot(V,U)}         |
\----------------------------------------------*/

/*------------------------------------\
| Mathematica version:                |
| Quaternion[                         |
|  (u3 v0)+ u2 v1 - u1 v2 + u0 v3,    |
|  -u2 v0 +(u3 v1)+ u0 v2 + u1 v3,    |
|   u1 v0 - u0 v1 +(u3 v2)+ u2 v3,    |
|                                     |
|  -u0 v0 - u1 v1 - u2 v2 +(u3 v3), ] |
\------------------------------------*/

VCTR_R(float*)qdotf(float dst[4], const float src[4], const float src2[4])
{
  return set4f(dst,
    src2[3]*src[0] + src2[2]*src[1] - src2[1]*src[2] + src2[0]*src[3],
  - src2[2]*src[0] + src2[3]*src[1] + src2[0]*src[2] + src2[1]*src[3],
    src2[1]*src[0] - src2[0]*src[1] + src2[3]*src[2] + src2[2]*src[3],

  - src2[0]*src[0] - src2[1]*src[1] - src2[2]*src[2] + src2[3]*src[3]
  );
}

VCTR_R(double*)qdotd(double dst[4], const double src[4], const double src2[4])
{
  return set4d(dst,
    src2[3]*src[0] + src2[2]*src[1] - src2[1]*src[2] + src2[0]*src[3],
  - src2[2]*src[0] + src2[3]*src[1] + src2[0]*src[2] + src2[1]*src[3],
    src2[1]*src[0] - src2[0]*src[1] + src2[3]*src[2] + src2[2]*src[3],

  - src2[0]*src[0] - src2[1]*src[1] - src2[2]*src[2] + src2[3]*src[3]
  );
}

/* versions which assume src2[3] == 0 */
VCTR_R(float*)q3dotf(float dst[4], const float src[4], const float src2[3])
{
  return set4f(dst,
  /*src2[3]*src[0]*/+ src2[2]*src[1]  - src2[1]*src[2]  + src2[0]*src[3],
  - src2[2]*src[0]/*+ src2[3]*src[1]*/+ src2[0]*src[2]  + src2[1]*src[3],
    src2[1]*src[0]  - src2[0]*src[1]/*+src2[3]*src[2]*/ + src2[2]*src[3],

  - src2[0]*src[0]  - src2[1]*src[1]  - src2[2]*src[2]/*+src2[3]*src[3]*/
  );
}

VCTR_R(double*)q3dotd(double dst[4], const double src[4], const double src2[3])
{
  return set4d(dst,
  /*src2[3]*src[0]*/+ src2[2]*src[1]  - src2[1]*src[2]  + src2[0]*src[3],
  - src2[2]*src[0]/*+ src2[3]*src[1]*/+ src2[0]*src[2]  + src2[1]*src[3],
    src2[1]*src[0]  - src2[0]*src[1]/*+src2[3]*src[2]*/ + src2[2]*src[3],

  - src2[0]*src[0]  - src2[1]*src[1]  - src2[2]*src[2]/*+src2[3]*src[3]*/
  );
}

VCTR_R(float*)qdot3f(float dst[3], const float src[4], const float src2[4])
{
  return set3f(dst,
    src2[3]*src[0] + src2[2]*src[1] - src2[1]*src[2] + src2[0]*src[3],
  - src2[2]*src[0] + src2[3]*src[1] + src2[0]*src[2] + src2[1]*src[3],
    src2[1]*src[0] - src2[0]*src[1] + src2[3]*src[2] + src2[2]*src[3]
  );
}

VCTR_R(double*)qdot3d(double dst[3], const double src[4], const double src2[4])
{
  return set3d(dst,
    src2[3]*src[0] + src2[2]*src[1] - src2[1]*src[2] + src2[0]*src[3],
  - src2[2]*src[0] + src2[3]*src[1] + src2[0]*src[2] + src2[1]*src[3],
    src2[1]*src[0] - src2[0]*src[1] + src2[3]*src[2] + src2[2]*src[3]
  );
}
/* rot(v, axis, angle) = U,  q<qdot>v<qdot>Q = {U,k}    *
 * q = {V,s}, V = sin(angle/2)axis, s = cos(angle/2)    *
 * Q = q^(-1) = {-V,s}                                  */
VCTR_R(float*)rotf(float dst[3], const float src[3], const float axis[3], float angle)
{
  VECTOR_USES_TMP(float _qtmpf[4])

  return qapplyf(dst, src, qrotf(_qtmpf, axis, angle));
}

VCTR_R(double*)rotd(double dst[3], const double src[3], const double axis[3], double angle)
{
  VECTOR_USES_TMP(double _qtmpd[4])

  return qapplyd(dst, src, qrotd(_qtmpd, axis, angle));
}

VCTR_R(float*)qrotf(float dst[4], const float axis[3], float angle)
{
  VECTOR_USES_TMP(float _qtmpf[4])
  double half_angle = (double)angle/2.0;

  _qtmpf[3] = (float)cos(half_angle);
  (void)mul3f(_qtmpf, axis, (float)sin(half_angle));
  return cpy4f(dst, _qtmpf);
}

VCTR_R(double*)qrotd(double dst[4], const double axis[3], double angle)
{
  VECTOR_USES_TMP(double _qtmpd[4])
  double half_angle = angle/2.0;

  _qtmpd[3] = cos(half_angle);
  (void)mul3d(_qtmpd, axis, sin(half_angle));
  return cpy4d(dst, _qtmpd);
}

VCTR_R(float*)qapplyf(float dst[3], const float src[3], const float rot[4])
{
  VECTOR_USES_TMP(float _qtmpf2[4])
  VECTOR_USES_TMP(float _qtmpf3[4])

  neg3f(_qtmpf2, rot);
  _qtmpf2[3] = rot[3];

  return qdot3f(dst, q3dotf(_qtmpf3, rot, src), _qtmpf2);
}

VCTR_R(double*)qapplyd(double dst[3], const double src[3], const double rot[4])
{
  VECTOR_USES_TMP(double _qtmpd2[4])
  VECTOR_USES_TMP(double _qtmpd3[4])

  neg3d(_qtmpd2, rot);
  _qtmpd2[3] = rot[3];

  return qdot3d(dst, q3dotd(_qtmpd3, rot, src), _qtmpd2);
}

/* project(v, ref) = (v<dot>ref)/(norm(ref)^2)ref */
VCTR_R(float*)project2f(float dst[2], const float v[2], const float ref[2])
{
  return mul2f(dst, ref, dot2f(v, ref)/dot2f(ref,ref));
}

VCTR_R(double*)project2d(double dst[2], const double v[2], const double ref[2])
{
  return mul2d(dst, ref, dot2d(v, ref)/dot2d(ref,ref));
}

VCTR_R(double*)project3d(double dst[3], const double v[3], const double ref[3])
{
  return mul3d(dst, ref, dot3d(v, ref)/dot3d(ref,ref));
}

VCTR_R(float*)project3f(float dst[3], const float v[3], const float ref[3])
{
  return mul3f(dst, ref, dot3f(v, ref)/dot3f(ref,ref));
}

VCTR_R(double*)project4d(double dst[4], const double v[4], const double ref[4])
{
  return mul4d(dst, ref, dot4d(v, ref)/dot4d(ref,ref));
}

VCTR_R(float*)project4f(float dst[4], const float v[4], const float ref[4])
{
  return mul4f(dst, ref, dot4f(v, ref));
}

/* projectu(v, ref) = (v<dot>ref)ref (assumes |ref| = 1) */
VCTR_R(float*)projectu2f(float dst[2], const float v[2], const float ref[2])
{
  return mul2f(dst, ref, dot2f(v, ref));
}

VCTR_R(double*)projectu2d(double dst[2], const double v[2], const double ref[2])
{
  return mul2d(dst, ref, dot2d(v, ref));
}

VCTR_R(double*)projectu3d(double dst[3], const double v[3], const double ref[3])
{
  return mul3d(dst, ref, dot3d(v, ref));
}

VCTR_R(float*)projectu3f(float dst[3], const float v[3], const float ref[3])
{
  return mul3f(dst, ref, dot3f(v, ref));
}

VCTR_R(double*)projectu4d(double dst[4], const double v[4], const double ref[4])
{
  return mul4d(dst, ref, dot4d(v, ref));
}

VCTR_R(float*)projectu4f(float dst[4], const float v[4], const float ref[4])
{
  return mul4f(dst, ref, dot4f(v, ref));
}

/* norm(v) = sqrt(v<dot>v) */
VCTR_R(float) norm2f(const float src[2])
{
  return sqrtf(dot2f(src,src));
}

VCTR_R(double)norm2d(const double src[2])
{
  return sqrtd(dot2d(src,src));
}

VCTR_R(float) norm3f(const float src[3])
{
  return sqrtf(dot3f(src,src));
}

VCTR_R(double)norm3d(const double src[3])
{
  return sqrtd(dot3d(src,src));
}

VCTR_R(float) norm4f(const float src[4])
{
  return sqrtf(dot4f(src,src));
}

VCTR_R(double)norm4d(const double src[4])
{
  return sqrtd(dot4d(src,src));
}

/* unit(v) = v/norm(v) = (1.0/norm(v))v */
VCTR_R(float*)unit2f(float dst[2], const float src[2])
{
  return mul2f(dst, src, 1.0f/norm2f(src));
}

VCTR_R(double*)unit2d(double dst[2], const double src[2])
{
  return mul2d(dst, src, 1.0/norm2d(src));
}

VCTR_R(float*)unit3f(float dst[3], const float src[3])
{
  return mul3f(dst, src, 1.0f/norm3f(src));
}

VCTR_R(double*)unit3d(double dst[3], const double src[3])
{
  return mul3d(dst, src, 1.0/norm3d(src));
}

VCTR_R(float*)unit4f(float dst[4], const float src[4])
{
  return mul4f(dst, src, 1.0f/norm4f(src));
}

VCTR_R(double*)unit4d(double dst[4], const double src[4])
{
  return mul4d(dst, src, 1.0/norm4d(src));
}

/* Make an identity matrix */
VCTR_R(float*)ident2x2f(float dst[4])
{
  int i,j;
  for(i=0;i<2;i++)
  for(j=0;j<2;j++)
    dst[i+j*2] = (float)(i == j);
  return dst;
}

VCTR_R(double*)ident2x2d(double dst[4])
{
  int i,j;
  for(i=0;i<2;i++)
  for(j=0;j<2;j++)
    dst[i+j*2] = (double)(i == j);
  return dst;
}

VCTR_R(float*)ident3x3f(float dst[9])
{
  int i,j;
  for(i=0;i<3;i++)
  for(j=0;j<3;j++)
    dst[i+j*3] = (float)(i == j);
  return dst;
}

VCTR_R(double*)ident3x3d(double dst[9])
{
  int i,j;
  for(i=0;i<3;i++)
  for(j=0;j<3;j++)
    dst[i+j*3] = (double)(i == j);
  return dst;
}

VCTR_R(float*)ident4x4f(float dst[16])
{
  int i,j;
  for(i=0;i<4;i++)
  for(j=0;j<4;j++)
    dst[i+j*4] = (float)(i == j);
  return dst;
}

VCTR_R(double*)ident4x4d(double dst[16])
{
  int i,j;
  for(i=0;i<4;i++)
  for(j=0;j<4;j++)
    dst[i+j*4] = (double)(i == j);
  return dst;
}

/* submatrix(M, i, j) = M without row i and column j */
VCTR_R(float*)submatrix3x3f(float dst[4], const float src[9], const int ix, const int jx)
{
  int i, j, idx = 0;
  for(j = 0; j < 3; j++)
  for(i = 0; i < 3; i++)
  {
    if(i != ix && j != jx)
      dst[idx++] = src[i + j*3];
  }
  return dst;
}

VCTR_R(double*)submatrix3x3d(double dst[4], const double src[9], const int ix, const int jx)
{
  int i, j, idx = 0;
  for(j = 0; j < 3; j++)
  for(i = 0; i < 3; i++)
  {
    if(i != ix && j != jx)
      dst[idx++] = src[i + j*3];
  }
  return dst;
}

VCTR_R(float*)submatrix4x4f(float dst[9], const float src[16], const int ix, const int jx)
{
  int i, j, idx = 0;
  for(j = 0; j < 4; j++)
  for(i = 0; i < 4; i++)
  {
    if(i != ix && j != jx)
      dst[idx++] = src[i + j*4];
  }
  return dst;
}

VCTR_R(double*)submatrix4x4d(double dst[9], const double src[16], const int ix, const int jx)
{
  int i, j, idx = 0;
  for(j = 0; j < 4; j++)
  for(i = 0; i < 4; i++)
  {
    if(i != ix && j != jx)
      dst[idx++] = src[i + j*4];
  }
  return dst;
}

VCTR_R(float) det2x2f(const float src[4])
{
  return src[0]*src[3] - src[1]*src[2];
}

VCTR_R(double)det2x2d(const double src[4])
{
  return src[0]*src[3] - src[1]*src[2];
}

VCTR_R(float) det3x3f(const float src[9])
{
  VECTOR_USES_TMP(float _tmp2x2f[4])
  return src[0]*det2x2f(submatrix3x3f(_tmp2x2f, src, 0, 0)) - 
         src[1]*det2x2f(submatrix3x3f(_tmp2x2f, src, 1, 0)) +
         src[2]*det2x2f(submatrix3x3f(_tmp2x2f, src, 2, 0));
}

VCTR_R(double)det3x3d(const double src[9])
{
  VECTOR_USES_TMP(double _tmp2x2d[4])
  return src[0]*det2x2d(submatrix3x3d(_tmp2x2d, src, 0, 0)) - 
         src[1]*det2x2d(submatrix3x3d(_tmp2x2d, src, 1, 0)) +
         src[2]*det2x2d(submatrix3x3d(_tmp2x2d, src, 2, 0));
}

VCTR_R(float) det4x4f(const float src[16])
{
  VECTOR_USES_TMP(float _tmp3x3f[9])
  return src[0]*det3x3f(submatrix4x4f(_tmp3x3f, src, 0, 0)) - 
         src[1]*det3x3f(submatrix4x4f(_tmp3x3f, src, 1, 0)) + 
         src[2]*det3x3f(submatrix4x4f(_tmp3x3f, src, 2, 0)) - 
         src[3]*det3x3f(submatrix4x4f(_tmp3x3f, src, 3, 0));
}

VCTR_R(double)det4x4d(const double src[16])
{
  VECTOR_USES_TMP(double _tmp3x3d[9])
  return src[0]*det3x3d(submatrix4x4d(_tmp3x3d, src, 0, 0)) - 
         src[1]*det3x3d(submatrix4x4d(_tmp3x3d, src, 1, 0)) + 
         src[2]*det3x3d(submatrix4x4d(_tmp3x3d, src, 2, 0)) - 
         src[3]*det3x3d(submatrix4x4d(_tmp3x3d, src, 3, 0));
}

/* M^-1_ij = cofactor(M_ji)/det(M)                 *
 * cofactor(M_ji) = -1^(i+j)*det(submatrix(M,i,j)) */
VCTR_R(float*)invert2x2f(float dst[4], const float src[4], float det)
{
  VECTOR_USES_TMP(float _mtmpf[4])
  float _det = 1.0f/det;
  _mtmpf[0] =  src[3]*_det;
  _mtmpf[1] = -src[2]*_det;
  _mtmpf[2] = -src[1]*_det;
  _mtmpf[3] =  src[0]*_det;
  return cpy2x2f(dst, _mtmpf);
}

VCTR_R(double*)invert2x2d(double dst[4], const double src[4], double det)
{
  VECTOR_USES_TMP(double _mtmpd[4])
  double _det = 1.0/det;
  _mtmpd[0] =  src[3]*_det;
  _mtmpd[1] = -src[2]*_det;
  _mtmpd[2] = -src[1]*_det;
  _mtmpd[3] =  src[0]*_det;
  return cpy2x2d(dst, _mtmpd);
}

VCTR_R(float*)invert3x3f(float dst[9], const float src[9], float det)
{
  VECTOR_USES_TMP(float _mtmpf[9])
  VECTOR_USES_TMP(float _tmp2x2f[4])
  float inv_det;
  int i,j;

  inv_det = 1.0f/det;

  for(j=0;j<3;j++)
  for(i=0;i<3;i++)
  {
    _mtmpf[i+j*3] = cofactor3x3f(src, inv_det, i, j);
  }
  return cpy3x3f(dst, _mtmpf);
}

VCTR_R(double*)invert3x3d(double dst[9], const double src[9], double det)
{
  VECTOR_USES_TMP(double _mtmpd[9])
  VECTOR_USES_TMP(double _tmp2x2d[4])
  double inv_det;
  int i,j;

  inv_det = 1.0/det;

  for(j=0;j<3;j++)
  for(i=0;i<3;i++)
  {
    _mtmpd[i+j*3] = cofactor3x3d(src, inv_det, i, j);
  }
  return cpy3x3d(dst, _mtmpd);
}

VCTR_R(float*)invert4x4f(float dst[16], const float src[16], float det)
{
  VECTOR_USES_TMP(float _mtmpf[16])
  VECTOR_USES_TMP(float _tmp3x3f[9])
  float inv_det;
  int i,j;

  inv_det = 1.0/det;

  for(j=0;j<4;j++)
  for(i=0;i<4;i++)
  {
    _mtmpf[i+j*4] = cofactor4x4f(src, inv_det, i, j);
  }
  return cpy4x4f(dst, _mtmpf);
}

VCTR_R(double*)invert4x4d(double dst[16], const double src[16], double det)
{
  VECTOR_USES_TMP(double _mtmpd[16])
  VECTOR_USES_TMP(double _tmp3x3d[9])
  double inv_det;
  int i,j;

  inv_det = 1.0/det;

  for(j=0;j<4;j++)
  for(i=0;i<4;i++)
  {
    _mtmpd[i+j*4] = cofactor4x4d(src, inv_det, i, j);
  }
  return cpy4x4d(dst, _mtmpd);
}

VCTR_R(float*)invertNxNf(float *dst, int n, const float *src, float det)
{
  float *tmp;
  float inv_det;
  int i,j;

  inv_det = 1.0f/det;
  tmp = malloc(sizeof(float)*n*n);

  for(j=0;j<n;j++)
  for(i=0;i<n;i++)
  {
    tmp[i+j*n] = cofactorNxNf(n, src, inv_det, i, j);
  }
  (void)cpyNxNf(dst, n, tmp);
  free(tmp);
  return dst;
}

VCTR_R(double*)invertNxNd(double *dst, int n, const double *src, double det)
{
  double *tmp;
  double inv_det;
  int i,j;

  inv_det = 1.0/det;
  tmp = malloc(sizeof(double)*n*n);

  for(j=0;j<n;j++)
  for(i=0;i<n;i++)
  {
    tmp[i+j*n] = cofactorNxNd(n, src, inv_det, i, j);
  }
  (void)cpyNxNd(dst, n, tmp);
  free(tmp);
  return dst;
}

VCTR_R(float*)inverse2x2f(float dst[4], const float src[4])
{
  return invert2x2f(dst, src, det2x2f(src));
}

VCTR_R(double*)inverse2x2d(double dst[4], const double src[4])
{
  return invert2x2d(dst, src, det2x2d(src));
}

VCTR_R(float*)inverse3x3f(float dst[9], const float src[9])
{
  return invert3x3f(dst, src, det3x3f(src));
}

VCTR_R(double*)inverse3x3d(double dst[9], const double src[9])
{
  return invert3x3d(dst, src, det3x3d(src));
}

VCTR_R(float*)inverse4x4f(float dst[16], const float src[16])
{
  return invert4x4f(dst, src, det4x4f(src));
}

VCTR_R(double*)inverse4x4d(double dst[16], const double src[16])
{
  return invert4x4d(dst, src, det4x4d(src));
}

VCTR_R(float*)inverseNxNf(float *dst, int n, const float *src)
{
  return invertNxNf(dst, n, src, detNxNf(n, src));
}

VCTR_R(double*)inverseNxNd(double *dst, int n, const double *src)
{
  return invertNxNd(dst, n, src, detNxNd(n, src));
}

VCTR_R(float) cofactor2x2f(const float m[4], float inv_det, int i, int j)
{
  return (-((i+j)%2)*2 + 1)*(inv_det)*m[(i+1)%2 + 2*((j+1)%2)];
}

VCTR_R(double)cofactor2x2d(const double m[4], double inv_det, int i, int j)
{
  return (-((i+j)%2)*2 + 1)*(inv_det)*m[(i+1)%2 + 2*((j+1)%2)];
}

VCTR_R(float) cofactor3x3f(const float m[9], float inv_det, int i, int j)
{
  VECTOR_USES_TMP(float _mtmpf[4]);
  return (-((i+j)%2)*2 + 1)*(inv_det)*det2x2f(submatrix3x3f(_mtmpf, m, i, j));
}

VCTR_R(double)cofactor3x3d(const double m[9], double inv_det, int i, int j)
{
  VECTOR_USES_TMP(double _mtmpd[4]);
  return (-((i+j)%2)*2 + 1)*(inv_det)*det2x2d(submatrix3x3d(_mtmpd, m, i, j));
}

VCTR_R(float) cofactor4x4f(const float m[16], float inv_det, int i, int j)
{
  VECTOR_USES_TMP(float _mtmpf[9]);
  return (-((i+j)%2)*2 + 1)*(inv_det)*det3x3f(submatrix4x4f(_mtmpf, m, i, j));
}

VCTR_R(double)cofactor4x4d(const double m[16], double inv_det, int i, int j)
{
  VECTOR_USES_TMP(double _mtmpd[9]);
  return (-((i+j)%2)*2 + 1)*(inv_det)*det3x3d(submatrix4x4d(_mtmpd, m, i, j));
}

VCTR_R(float*)mul2x2f(float dst[4], const float src[4], const float src2[4])
{
  VECTOR_USES_TMP(float _mtmpf[4])
  int i,j,x;
  float sum;
  for(i=0;i<2;i++)
  for(j=0;j<2;j++)
  {
    sum = 0.0f;
    for(x=0;x<2;x++)
      sum += src[x+j*2]*src2[i+x*2];
    _mtmpf[i+j*2] = sum;
  }
  return cpy2x2f(dst, _mtmpf);
}

VCTR_R(double*)mul2x2d(double dst[4], const double src[4], const double src2[4])
{
  VECTOR_USES_TMP(double _mtmpd[4])
  int i,j,x;
  double sum;
  for(i=0;i<2;i++)
  for(j=0;j<2;j++)
  {
    sum = 0.0;
    for(x=0;x<2;x++)
      sum += src[x+j*2]*src2[i+x*2];
    _mtmpd[i+j*2] = sum;
  }
  return cpy2x2d(dst, _mtmpd);
}

VCTR_R(float*)mul3x3f(float dst[9], const float src[9], const float src2[9])
{
  VECTOR_USES_TMP(float _mtmpf[9])
  int i,j,x;
  float sum;
  for(i=0;i<3;i++)
  for(j=0;j<3;j++)
  {
    sum = 0.0f;
    for(x=0;x<3;x++)
      sum += src[x+j*3]*src2[i+x*3];
    _mtmpf[i+j*3] = sum;
  }
  return cpy3x3f(dst, _mtmpf);
}

VCTR_R(double*)mul3x3d(double dst[9], const double src[9], const double src2[9])
{
  VECTOR_USES_TMP(double _mtmpd[9])
  int i,j,x;
  double sum;
  for(i=0;i<3;i++)
  for(j=0;j<3;j++)
  {
    sum = 0.0;
    for(x=0;x<3;x++)
      sum += src[x+j*3]*src2[i+x*3];
    _mtmpd[i+j*3] = sum;
  }
  return cpy3x3d(dst, _mtmpd);
}

VCTR_R(float*)mul4x4f(float dst[16], const float src[16], const float src2[16])
{
  VECTOR_USES_TMP(float _mtmpf[16])
  int i,j,x;
  float sum;
  for(i=0;i<4;i++)
  for(j=0;j<4;j++)
  {
    sum = 0.0f;
    for(x=0;x<4;x++)
      sum += src[x+j*4]*src2[i+x*4];
    _mtmpf[i+j*4] = sum;
  }
  return cpy4x4f(dst, _mtmpf);
}

VCTR_R(double*)mul4x4d(double dst[16], const double src[16], const double src2[16])
{
  VECTOR_USES_TMP(double _mtmpd[16])
  int i,j,x;
  double sum;
  for(i=0;i<4;i++)
  for(j=0;j<4;j++)
  {
    sum = 0.0;
    for(x=0;x<4;x++)
      sum += src[x+j*4]*src2[i+x*4];
    _mtmpd[i+j*4] = sum;
  }
  return cpy4x4d(dst, _mtmpd);
}


VCTR_R(float*)mulNxNf(float *dst, int n, const float *src, const float *src2)
{
  int i,j,x;
  float sum, *tmp;
  tmp = malloc(sizeof(float)*n*n);
  for(i=0;i<n;i++)
  for(j=0;j<n;j++)
  {
    sum = 0.0f;
    for(x=0;x<n;x++)
      sum += src[x+j*n]*src2[i+x*n];
    tmp[i+j*n] = sum;
  }
  cpyNxNf(dst, n, tmp);
  free(tmp);
  return dst;
}

VCTR_R(double*)mulNxNd(double *dst, int n, const double *src, const double *src2)
{
  int i,j,x;
  double sum, *tmp;
  tmp = malloc(sizeof(double)*n*n);
  for(i=0;i<n;i++)
  for(j=0;j<n;j++)
  {
    sum = 0.0f;
    for(x=0;x<n;x++)
      sum += src[x+j*n]*src2[i+x*n];
    tmp[i+j*n] = sum;
  }
  cpyNxNd(dst, n, tmp);
  free(tmp);
  return dst;
}

/* postmultiply(v,M) = vM */
VCTR_R(float*)postmultiply2x2f(float dst[2], const float v[2], const float m[4])
{
  VECTOR_USES_TMP(float _tmp4f[2])
  _tmp4f[0] = v[0]*m[0] + v[1]*m[2];
  _tmp4f[1] = v[0]*m[1] + v[1]*m[3];
  return cpy2f(dst, _tmp4f);
}

VCTR_R(double*)postmultiply2x2d(double dst[2], const double v[2], const double m[4])
{
  VECTOR_USES_TMP(double _tmp4d[2])
  _tmp4d[0] = v[0]*m[0] + v[1]*m[2];
  _tmp4d[1] = v[0]*m[1] + v[1]*m[3];
  return cpy2d(dst, _tmp4d);
}

VCTR_R(float*)postmultiply3x3f(float dst[3], const float v[3], const float m[9])
{
  VECTOR_USES_TMP(float _tmp4f[3])
  _tmp4f[0] = v[0]*m[0] + v[1]*m[3] + v[2]*m[6];
  _tmp4f[1] = v[0]*m[1] + v[1]*m[4] + v[2]*m[7];
  _tmp4f[2] = v[0]*m[2] + v[1]*m[5] + v[2]*m[8];
  return cpy3f(dst, _tmp4f);
}

VCTR_R(double*)postmultiply3x3d(double dst[3], const double v[3], const double m[9])
{
  VECTOR_USES_TMP(double _tmp4d[3])
  _tmp4d[0] = v[0]*m[0] + v[1]*m[3] + v[2]*m[6];
  _tmp4d[1] = v[0]*m[1] + v[1]*m[4] + v[2]*m[7];
  _tmp4d[2] = v[0]*m[2] + v[1]*m[5] + v[2]*m[8];
  return cpy3d(dst, _tmp4d);
}

VCTR_R(float*)postmultiply4x4f(float dst[4], const float v[4], const float m[16])
{
  VECTOR_USES_TMP(float _tmp4f[4])
  _tmp4f[0] = v[0]*m[0] + v[1]*m[4] + v[2]*m[ 8] + v[3]*m[12];
  _tmp4f[1] = v[0]*m[1] + v[1]*m[5] + v[2]*m[ 9] + v[3]*m[13];
  _tmp4f[2] = v[0]*m[2] + v[1]*m[6] + v[2]*m[10] + v[3]*m[14];
  _tmp4f[3] = v[0]*m[3] + v[1]*m[7] + v[2]*m[11] + v[3]*m[15];
  return cpy4f(dst, _tmp4f);
}

VCTR_R(double*)postmultiply4x4d(double dst[4], const double v[4], const double m[16])
{
  VECTOR_USES_TMP(double _tmp4d[4])
  _tmp4d[0] = v[0]*m[0] + v[1]*m[4] + v[2]*m[ 8] + v[3]*m[12];
  _tmp4d[1] = v[0]*m[1] + v[1]*m[5] + v[2]*m[ 9] + v[3]*m[13];
  _tmp4d[2] = v[0]*m[2] + v[1]*m[6] + v[2]*m[10] + v[3]*m[14];
  _tmp4d[3] = v[0]*m[3] + v[1]*m[7] + v[2]*m[11] + v[3]*m[15];
  return cpy4d(dst, _tmp4d);
}

/* transform(M,v) = Mv */
VCTR_R(float*)transform2x2f(float dst[2], const float m[4], const float v[2])
{
  VECTOR_USES_TMP(float _tmp4f[2])
  _tmp4f[0] = dot2f(v, &m[0]);
  _tmp4f[1] = dot2f(v, &m[2]);
  return cpy2f(dst, _tmp4f);
}

VCTR_R(double*)transform2x2d(double dst[2], const double m[4], const double v[2])
{
  VECTOR_USES_TMP(double _tmp4d[2])
  _tmp4d[0] = dot2d(v, &m[0]);
  _tmp4d[1] = dot2d(v, &m[2]);
  return cpy2d(dst, _tmp4d);
}

VCTR_R(float*)transform3x3f(float dst[3], const float m[9], const float v[3])
{
  VECTOR_USES_TMP(float _tmp4f[3])
  _tmp4f[0] = dot3f(v, &m[0]);
  _tmp4f[1] = dot3f(v, &m[3]);
  _tmp4f[3] = dot3f(v, &m[6]);
  return cpy3f(dst, _tmp4f);
}

VCTR_R(double*)transform3x3d(double dst[3], const double m[9], const double v[3])
{
  VECTOR_USES_TMP(double _tmp4d[3])
  _tmp4d[0] = dot3d(v, &m[0]);
  _tmp4d[1] = dot3d(v, &m[3]);
  _tmp4d[2] = dot3d(v, &m[6]);
  return cpy3d(dst, _tmp4d);
}

VCTR_R(float*)transform4x4f(float dst[4], const float m[16], const float v[4])
{
  VECTOR_USES_TMP(float _tmp4f[4])
  _tmp4f[0] = dot4f(v, &m[ 0]);
  _tmp4f[1] = dot4f(v, &m[ 4]);
  _tmp4f[2] = dot4f(v, &m[ 8]);
  _tmp4f[3] = dot4f(v, &m[12]);
  return cpy4f(dst, _tmp4f);
}

VCTR_R(double*)transform4x4d(double dst[4], const double m[16], const double v[4])
{
  VECTOR_USES_TMP(double _tmp4d[4])
  _tmp4d[0] = dot4d(v, &m[ 0]);
  _tmp4d[1] = dot4d(v, &m[ 4]);
  _tmp4d[2] = dot4d(v, &m[ 8]);
  _tmp4d[3] = dot4d(v, &m[12]);
  return cpy4d(dst, _tmp4d);
}

/* tensor(M,v) = vMv */
VCTR_R(float) tensor2x2f(const float src[4], const float v[2])
{
  VECTOR_USES_TMP(float _tmp4f[2])
  return dot2f(v, transform2x2f(_tmp4f, src, v));
}

VCTR_R(double)tensor2x2d(const double src[4], const double v[2])
{
  VECTOR_USES_TMP(double _tmp4d[2])
  return dot2d(v, transform2x2d(_tmp4d, src, v));
}

VCTR_R(float) tensor3x3f(const float src[9], const float v[3])
{
  VECTOR_USES_TMP(float _tmp4f[3])
  return dot3f(v, transform3x3f(_tmp4f, src, v));
}

VCTR_R(double)tensor3x3d(const double src[9], const double v[3])
{
  VECTOR_USES_TMP(double _tmp4d[3])
  return dot3d(v, transform3x3d(_tmp4d, src, v));
}

VCTR_R(float) tensor4x4f(const float src[16], const float v[4])
{
  VECTOR_USES_TMP(float _tmp4f[4])
  return dot4f(v, transform4x4f(_tmp4f, src, v));
}

VCTR_R(double)tensor4x4d(const double src[16], const double v[4])
{
  VECTOR_USES_TMP(double _tmp4d[4])
  return dot4d(v, transform4x4d(_tmp4d, src, v));
}

/* transpose, flip over main diagonal */
VCTR_R(float*)transpose2x2f(float dst[4], const float src[4])
{
  VECTOR_USES_TMP(float _mtmpf[4])
  int i,j;
  for(i=0;i<2;i++)
  for(j=0;j<2;j++)
    _mtmpf[i+j*2] = src[j+i*2];
  return cpy2x2f(dst, _mtmpf);
}

VCTR_R(double*)transpose2x2d(double dst[4], const double src[4])
{
  VECTOR_USES_TMP(double _mtmpd[4])
  int i,j;
  for(i=0;i<2;i++)
  for(j=0;j<2;j++)
    _mtmpd[i+j*2] = src[j+i*2];
  return cpy2x2d(dst, _mtmpd);
}

VCTR_R(float*)transpose3x3f(float dst[9], const float src[9])
{
  VECTOR_USES_TMP(float _mtmpf[9])
  int i,j;
  for(i=0;i<3;i++)
  for(j=0;j<3;j++)
    _mtmpf[i+j*3] = src[j+i*3];
  return cpy3x3f(dst, _mtmpf);
}

VCTR_R(double*)transpose3x3d(double dst[9], const double src[9])
{
  VECTOR_USES_TMP(double _mtmpd[9])
  int i,j;
  for(i=0;i<3;i++)
  for(j=0;j<3;j++)
    _mtmpd[i+j*3] = src[j+i*3];
  return cpy3x3d(dst, _mtmpd);
}

VCTR_R(float*)transpose4x4f(float dst[16], const float src[16])
{
  VECTOR_USES_TMP(float _mtmpf[16])
  int i,j;
  for(i=0;i<4;i++)
  for(j=0;j<4;j++)
    _mtmpf[i+j*4] = src[j+i*4];
  return cpy4x4f(dst, _mtmpf);
}

VCTR_R(double*)transpose4x4d(double dst[16], const double src[16])
{
  VECTOR_USES_TMP(double _mtmpd[16])
  int i,j;
  for(i=0;i<4;i++)
  for(j=0;j<4;j++)
    _mtmpd[i+j*4] = src[j+i*4];
  return cpy4x4d(dst, _mtmpd);
}

VCTR_R(float*)cpy2f(float d[2], const float s[2])
{
  return (float*)memcpy(d, s, sizeof(float)*2);
}

VCTR_R(double*)cpy2d(double d[2], const double s[2])
{
  return (double*)memcpy(d, s, sizeof(double)*2);
}

VCTR_R(float*)cpy3f(float d[3], const float s[3])
{
  return (float*)memcpy(d, s, sizeof(float)*3);
}

VCTR_R(double*)cpy3d(double d[3], const double s[3])
{
  return (double*)memcpy(d, s, sizeof(double)*3);
}

VCTR_R(float*)cpy4f(float d[4], const float s[4])
{
  return (float*)memcpy(d, s, sizeof(float)*4);
}

VCTR_R(double*)cpy4d(double d[4], const double s[4])
{
  return (double*)memcpy(d, s, sizeof(double)*4);
}

VCTR_R(float*)cpyNf(float *d, int n, const float *s)
{
  return (float*)memcpy(d, s, sizeof(float)*n);
}

VCTR_R(double*)cpyNd(double *d, int n, const double *s)
{
  return (double*)memcpy(d, s, sizeof(double)*n);
}

VCTR_R(float*)cpy2x2f(float d[4], const float s[4])
{
  return (float*)memcpy(d, s, sizeof(float)*4);
}

VCTR_R(double*)cpy2x2d(double d[4], const double s[4])
{
  return (double*)memcpy(d, s, sizeof(double)*4);
}

VCTR_R(float*)cpy3x3f(float d[9], const float s[9])
{
  return (float*)memcpy(d, s, sizeof(float)*9);
}

VCTR_R(double*)cpy3x3d(double d[9], const double s[9])
{
  return (double*)memcpy(d, s, sizeof(double)*9);
}

VCTR_R(float*)cpy4x4f(float d[16], const float s[16])
{
  return (float*)memcpy(d, s, sizeof(float)*16);
}

VCTR_R(double*)cpy4x4d(double d[16], const double s[16])
{
  return (double*)memcpy(d, s, sizeof(double)*16);
}

VCTR_R(int) absmaxindex2f(const float v[2])
{
  return fabsf(v[1]) > fabsf(v[0]);
}

VCTR_R(int) absmaxindex2d(const double v[2])
{
  return fabsd(v[1]) > fabsd(v[0]);
}


  /*------------------------------\
  | UNIX v6 Kernel source comment |
/-+-------------------------------+-------\
| you are not expected to understand this |
\------+--------------------------+-------/
       | But I think it's elegant |
       \-------------------------*/

VCTR_R(int) absmaxindex3f(const float v[3])
{
  VECTOR_USES_TMP(float _tmp4f[3])
  int i[2] = {-1, 2};
  _tmp4f[0] = fabsf(v[0]);
  _tmp4f[1] = fabsf(v[1]);
  _tmp4f[2] = fabsf(v[2]);
  return i[
    _tmp4f[2] > _tmp4f[
      i[0] = _tmp4f[1] > _tmp4f[0]
    ]
  ];
}

VCTR_R(int) absmaxindex3d(const double v[3])
{
  VECTOR_USES_TMP(double _tmp4d[3])
  int i[2] = {-1, 2};
  _tmp4d[0] = fabsd(v[0]);
  _tmp4d[1] = fabsd(v[1]);
  _tmp4d[2] = fabsd(v[2]);
  return i[
    _tmp4d[2] > _tmp4d[
      i[0] = _tmp4d[1] > _tmp4d[0]
    ]
  ];
}

VCTR_R(int) absmaxindex4f(const float v[4])
{
  VECTOR_USES_TMP(float _tmp4f[4])
  int i[2] = {-1, 2};
  int i2[2] = {-1, 3};
  _tmp4f[0] = fabsf(v[0]);
  _tmp4f[1] = fabsf(v[1]);
  _tmp4f[2] = fabsf(v[2]);
  _tmp4f[3] = fabsf(v[3]);
  return i2[
    _tmp4f[3] > _tmp4f[
      i2[0] = i[
        _tmp4f[2] > _tmp4f[
          i[0] = _tmp4f[1] > _tmp4f[0]
        ]
      ]
    ]
  ];
}

VCTR_R(int) absmaxindex4d(const double v[4])
{
  VECTOR_USES_TMP(double _tmp4d[4])
  int i[2] = {-1, 2};
  int i2[2] = {-1, 3};
  _tmp4d[0] = fabsd(v[0]);
  _tmp4d[1] = fabsd(v[1]);
  _tmp4d[2] = fabsd(v[2]);
  _tmp4d[3] = fabsd(v[3]);
  return i2[
    _tmp4d[3] > _tmp4d[
      i2[0] = i[
        _tmp4d[2] > _tmp4d[
          i[0] = _tmp4d[1] > _tmp4d[0]
        ]
      ]
    ]
  ];
}

VCTR_R(float*)cpy2x2to3x3f(float dst[9], const float src[4])
{
  cpy2f(&dst[0*3], &src[0*2]);
  cpy2f(&dst[1*3], &src[1*2]);
  return dst;
}

VCTR_R(double*)cpy2x2to3x3d(double dst[9], const double src[4])
{
  cpy2d(&dst[0*3], &src[0*2]);
  cpy2d(&dst[1*3], &src[1*2]);
  return dst;
}

VCTR_R(float*)cpy3x3to4x4f(float dst[16], const float src[9])
{
  cpy3f(&dst[0*4], &src[0*3]);
  cpy3f(&dst[1*4], &src[1*3]);
  cpy3f(&dst[2*4], &src[2*3]);
  return dst;
}

VCTR_R(double*)cpy3x3to4x4d(double dst[16], const double src[9])
{
  cpy3d(&dst[0*4], &src[0*2]);
  cpy3d(&dst[1*4], &src[1*3]);
  cpy3d(&dst[2*4], &src[2*3]);
  return dst;
}

VCTR_R(float*)cpy4x4to3x3f(float dst[9], const float src[16])
{
  cpy3f(&dst[0*3], &src[0*4]);
  cpy3f(&dst[1*3], &src[1*4]);
  cpy3f(&dst[2*3], &src[2*4]);
  return dst;
}

VCTR_R(double*)cpy4x4to3x3d(double dst[9], const double src[16])
{
  cpy3d(&dst[0*3], &src[0*4]);
  cpy3d(&dst[1*3], &src[1*4]);
  cpy3d(&dst[2*3], &src[2*4]);
  return dst;
}

VCTR_R(float*)cpy3x3to2x2f(float dst[4], const float src[9])
{
  cpy3f(&dst[0*2], &src[0*3]);
  cpy3f(&dst[1*2], &src[1*3]);
  return dst;
}

VCTR_R(double*)cpy3x3to2x2d(double dst[4], const double src[9])
{
  cpy3d(&dst[0*2], &src[0*3]);
  cpy3d(&dst[1*2], &src[1*3]);
  return dst;
}

VCTR_R(float*)cpyNxNtoMxMf(float *dst, int n, const float *src, int m)
{
  int min_nm, i,j;

  min_nm = (n < m ? n : m);

  for(i=0;i<min_nm;i++)
  for(j=0;j<min_nm;j++)
    dst[i + j*n] = src[i + j*m];
  
  return dst;
}

VCTR_R(double*)cpyNxNtoMxMd(double *dst, int n, const double *src, int m)
{
  int min_nm, i,j;

  min_nm = (n < m ? n : m);

  for(i=0;i<min_nm;i++)
  for(j=0;j<min_nm;j++)
    dst[i + j*n] = src[i + j*m];
  
  return dst;
}

VCTR_R(float*)getcolumn2x2f(float v[3], const float m[4], int column)
{
  int row;
  for(row=0;row<2;row++)
    v[row] = m[column + 2*row];
  return v;
}

VCTR_R(double*)getcolumn2x2d(double v[3], const double m[4], int column)
{
  int row;
  for(row=0;row<2;row++)
    v[row] = m[column + 2*row];
  return v;
}

VCTR_R(float*)getcolumn3x3f(float v[3], const float m[9], int column)
{
  int row;
  for(row=0;row<3;row++)
    v[row] = m[column + 3*row];
  return v;
}

VCTR_R(double*)getcolumn3x3d(double v[3], const double m[9], int column)
{
  int row;
  for(row=0;row<3;row++)
    v[row] = m[column + 3*row];
  return v;
}

VCTR_R(float*)getcolumn4x4f(float v[4], const float m[16], int column)
{
  int row;
  for(row=0;row<4;row++)
    v[row] = m[column + 4*row];
  return v;
}

VCTR_R(double*)getcolumn4x4d(double v[4], const double m[16], int column)
{
  int row;
  for(row=0;row<4;row++)
    v[row] = m[column + 4*row];
  return v;
}

VCTR_R(float*)getcolumnNxNf(float *v, int n, const float *m, int column)
{
  int row;
  for(row=0;row<n;row++)
    v[row] = m[column + n*row];
  return v;
}

VCTR_R(double*)getcolumnNxNd(double *v, int n, const double *m, int column)
{
  int row;
  for(row=0;row<n;row++)
    v[row] = m[column + n*row];
  return v;
}

VCTR_R(float*)setcolumn2x2f(float m[4], const float *v, int column, int vstart, int vend)
{
  int row;
  for(row=vstart;row<vend;row++)
    m[column+row*2] = *v++;
  return m;
}

VCTR_R(double*)setcolumn2x2d(double m[4], const double *v, int column, int vstart, int vend)
{
  int row;
  for(row=vstart;row<vend;row++)
    m[column+row*2] = *v++;
  return m;
}

VCTR_R(float*)setcolumn3x3f(float m[9], const float *v, int column, int vstart, int vend)
{
  int row;
  for(row=vstart;row<vend;row++)
    m[column+row*3] = *v++;
  return m;
}

VCTR_R(double*)setcolumn3x3d(double m[9], const double *v, int column, int vstart, int vend)
{
  int row;
  for(row=vstart;row<vend;row++)
    m[column+row*3] = *v++;
  return m;
}

VCTR_R(float*)setcolumn4x4f(float m[16], const float *v, int column, int vstart, int vend)
{
  int row;
  for(row=vstart;row<vend;row++)
    m[column+row*4] = *v++;
  return m;
}

VCTR_R(double*)setcolumn4x4d(double m[16], const double *v, int column, int vstart, int vend)
{
  int row;
  for(row=vstart;row<vend;row++)
    m[column+row*4] = *v++;
  return m;
}

VCTR_R(float*)setcolumnNxNf(float *m, int n, const float *v, int column, int vstart, int vend)
{
  int row;
  for(row=vstart;row<vend;row++)
    m[column+row*n] = *v++;
  return m;
}

VCTR_R(double*)setcolumnNxNd(double *m, int n, const double *v, int column, int vstart, int vend)
{
  int row;
  for(row=vstart;row<vend;row++)
    m[column+row*n] = *v++;
  return m;
}



/*----------------------\
| N-DIMENSIONAL VECTORS |
\----------------------*/

/* (u) + (v) */
VCTR_R(float*)addNf(float *dst, int n, const float *src, const float *src2)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = src[i] + src2[i];
  return dst;
}

VCTR_R(double*)addNd(double *dst, int n, const double *src, const double *src2)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = src[i] + src2[i];
  return dst;
}

/* (u) - (v) */
VCTR_R(float*)subNf(float *dst, int n, const float *src, const float *src2)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = src[i] - src2[i];
  return dst;
}

VCTR_R(double*)subNd(double *dst, int n, const double *src, const double *src2)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = src[i] - src2[i];
  return dst;
}

/* a(v) */
VCTR_R(float*)mulNf(float *dst, int n, const float *src, float mul)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = src[i]*mul;
  return dst;
}

VCTR_R(double*)mulNd(double *dst, int n, const double *src, double mul)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = src[i]*mul;
  return dst;
}

/* v+a(u) */
VCTR_R(float*)madNf(float *dst, int n, const float *src, const float *src2, float mul)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = src[i]+src2[i]*mul;
  return dst;
}

VCTR_R(double*)madNd(double *dst, int n, const double *src, const double *src2, double mul)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = src[i]+src2[i]*mul;
  return dst;
}

/* -v */
VCTR_R(float*)negNf(float *dst, int n, const float *src)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = -src[i];
  return dst;
}

VCTR_R(double*)negNd(double *dst, int n, const double *src)
{
  int i;
  for(i = 0; i < n; i++)
    dst[i] = -src[i];
  return dst;
}

VCTR_R(float*)setNf(float *v, int n, ...)
{
  int i;
  va_list args;

  va_start(args, n);
  for(i = 0; i < n; i++)
    v[i] = (float)va_arg(args, double);
  va_end(args);
  return v;
}

VCTR_R(double*)setNd(double *v, int n, ...)
{
  int i;
  va_list args;

  va_start(args, n);
  for(i = 0; i < n; i++)
    v[i] = va_arg(args, double);
  va_end(args);
  return v;
}


/* (u)<dot>(v) */
VCTR_R(float) dotNf(int n, const float *src, const float *src2)
{
  int i;
  float r = 0.0f;
  for(i=0;i<n;i++)
    r += src[i]+src2[i];
  return r;
}

VCTR_R(double)dotNd(int n, const double *src, const double *src2)
{
  int i;
  double r = 0.0;
  for(i=0;i<n;i++)
    r += src[i]+src2[i];
  return r;
}

/* project (v) onto (ref) */
VCTR_R(float*)projectNf(float *dst, int n, const float *v, const float *ref)
{
  return mulNf(dst, n, ref, dotNf(n, v, ref)/dotNf(n, ref, ref));
}

VCTR_R(double*)projectNd(double *dst, int n, const double *v, const double *ref)
{
  return mulNd(dst, n, ref, dotNd(n, v, ref)/dotNd(n, ref, ref));
}


/* project (v) onto UNIT (ref) */
VCTR_R(float*)projectuNf(float *dst, int n, const float *v, const float *ref)
{
  return mulNf(dst, n, ref, dotNf(n, v, ref));
}

VCTR_R(double*)projectuNd(double *dst, int n, const double *v, const double *ref)
{
  return mulNd(dst, n, ref, dotNd(n, v, ref));
}


/* ||(v)|| */
VCTR_R(float) normNf(const float *src, int n)
{
  return sqrtf(dotNf(n, src, src));
}

VCTR_R(double)normNd(const double *src, int n)
{
  return sqrtd(dotNd(n, src, src));
}


/* (v)/||(v)|| */
VCTR_R(float*)unitNf(float *dst, int n, const float *src)
{
  return mulNf(dst, n, src, 1.0f/normNf(src, n));
}

VCTR_R(double*)unitNd(double *dst, int n, const double *src)
{
  return mulNd(dst, n, src, 1.0/normNd(src, n));
}


/* fast index of greatest (magnitude) component */
VCTR_R(int) absmaxindexNf(const float *v, int n)
{
  int i, max = 0;
  float m = v[max];
  
  for(i = 0; i < n; i++)
  {
    if(v[i] > m)
      max = i, m = v[max];
  }
  return max;
}

VCTR_R(int) absmaxindexNd(const double *v, int n)
{
  int i, max = 0;
  double m = v[max];
  
  for(i = 0; i < n; i++)
  {
    if(v[i] > m)
      max = i, m = v[max];
  }
  return max;
}

/* [I] */
VCTR_R(float*)identNxNf(float *dst, int n)
{
  int i,j;
  for(i=0;i<n;i++)
  for(j=0;i<n;j++)
    dst[i+j*n] = (float)(i==j);
  return dst;
}

VCTR_R(double*)identNxNd(double *dst, int n)
{
  int i,j;
  for(i=0;i<n;i++)
  for(j=0;i<n;j++)
    dst[i+j*n] = (double)(i==j);
  return dst;
}

/* submatrix(M, i, j) = M without row i and column j */
VCTR_R(float*)submatrixNxNf(float *dst, int n, const float *src, const int ix, const int jx)
{
  int i, j, idx = 0;
  for(j = 0; j < n; j++)
  for(i = 0; i < n; i++)
  {
    if(i != ix && j != jx)
      dst[idx++] = src[i + j*n];
  }
  return dst;
}

VCTR_R(double*)submatrixNxNd(double *dst, int n, const double *src, const int ix, const int jx)
{
  int i, j, idx = 0;
  for(j = 0; j < n; j++)
  for(i = 0; i < n; i++)
  {
    if(i != ix && j != jx)
      dst[idx++] = src[i + j*n];
  }
  return dst;
}


VCTR_R(float) detNxNf(int n, const float *src)
{
  int i;
  float s[2] = {1.0f, -1.0f};
  float r = 0.0f, *tmp;
  
  if(n == 2)
    return det2x2f(src);

  tmp = malloc(sizeof(float)*(n-1)*(n-1));
  
  for(i=0;i<n;i++)
    r += s[i%2]*detNxNf(n - 1, submatrixNxNf(tmp, n, src, i, 0));

  free(tmp);
  return r;
}

VCTR_R(double)detNxNd(int n, const double *src)
{
  int i;
  double s[2] = {1.0, -1.0};
  double r = 0.0f, *tmp;
  
  if(n == 2)
    return det2x2d(src);

  tmp = malloc(sizeof(double)*(n-1)*(n-1));
  
  for(i=0;i<n;i++)
    r += s[i%2]*detNxNd(n - 1, submatrixNxNd(tmp, n, src, i, 0));

  free(tmp);
  return r;
}

VCTR_R(float) cofactorNxNf(int n, const float *m, float inv_det, int i, int j)
{
  float *tmp, t;
  static float signf[2] = {1.0f, -1.0f};
  tmp = malloc(sizeof(float)*(n-1)*(n-1));
  t = signf[(i+j)%2]*(inv_det)*detNxNf(n-1, submatrixNxNf(tmp, n, m, i, j));
  free(tmp);
  return t;
}

VCTR_R(double)cofactorNxNd(int n, const double *m, double inv_det, int i, int j)
{  
  double *tmp, t;
  static double signd[2] = {1.0f, -1.0f};
  tmp = malloc(sizeof(double)*(n-1)*(n-1));
  t = signd[(i+j)%2]*(inv_det)*detNxNd(n-1, submatrixNxNd(tmp, n, m, i, j));
  free(tmp);
  return t;
}

VCTR_R(double*)cpyNxNd(double *dst, int n, const double *src)
{
  return (double *)memcpy(dst, src, sizeof(double)*n*n);
}

VCTR_R(float*)cpyNxNf(float *dst, int n, const float *src)
{
  return (float *)memcpy(dst, src, sizeof(float)*n*n);
}

VCTR_R(float*)scale2x2f(float dst[4], const float src[4], float mul)
{
  int i, j;
  for(i=0; i<2; i++)
  for(j=0; j<2; j++)
    dst[i+2*j] = src[i+2*j]*mul;
  return dst; 
}

VCTR_R(double*)scale2x2d(double dst[4], const double src[4], double mul)
{
  int i, j;
  for(i=0; i<2; i++)
  for(j=0; j<2; j++)
    dst[i+2*j] = src[i+2*j]*mul;
  return dst; 
}

VCTR_R(float*)scale3x3f(float dst[9], const float src[9], float mul)
{
  int i, j;
  for(i=0; i<3; i++)
  for(j=0; j<3; j++)
    dst[i+3*j] = src[i+3*j]*mul;
  return dst; 
}

VCTR_R(double*)scale3x3d(double dst[9], const double src[9], double mul)
{
  int i, j;
  for(i=0; i<3; i++)
  for(j=0; j<3; j++)
    dst[i+3*j] = src[i+3*j]*mul;
  return dst; 
}

VCTR_R(float*)scale4x4f(float dst[16], const float src[16], float mul)
{
  int i, j;
  for(i=0; i<4; i++)
  for(j=0; j<4; j++)
    dst[i+4*j] = src[i+4*j]*mul;
  return dst; 
}

VCTR_R(double*)scale4x4d(double dst[16], const double src[16], double mul)
{
  int i, j;
  for(i=0; i<4; i++)
  for(j=0; j<4; j++)
    dst[i+4*j] = src[i+4*j]*mul;
  return dst; 
}

VCTR_R(float*)scaleNxNf(float *dst, int n, const float *src, float mul)
{
  int i, j;
  for(i=0; i<n; i++)
  for(j=0; j<n; j++)
    dst[i+n*j] = src[i+n*j]*mul;
  return dst; 
}

VCTR_R(double*)scaleNxNd(double *dst, int n, const double *src, double mul)
{
  int i, j;
  for(i=0; i<n; i++)
  for(j=0; j<n; j++)
    dst[i+n*j] = src[i+n*j]*mul;
  return dst; 
}


/* postmultiply(v,M) = vM */
VCTR_R(float*)postmultiplyNxNf(float *dst, int n, const float *v, const float *m)
{
  int i,j;
  float *tmp = malloc(sizeof(float)*n);
  
  for(i=0;i<n;i++)
  {
    tmp[i] = 0.0f;
    for(j=0;j<n;j++)
    {
      tmp[i] += v[j]*m[i+j*n];
    }
  }

  cpyNf(dst, n, tmp);
  free(tmp);
  return dst;
}

VCTR_R(double*)postmultiplyNxNd(double *dst, int n, const double *v, const double *m)
{
  int i, j;
  double *tmp = malloc(sizeof(double)*n);
  
  for(i=0;i<n;i++)
  {
    tmp[i] = 0.0;
    for(j=0;j<n;j++)
    {
      tmp[i] += v[j]*m[i+j*n];
    }
  }

  cpyNd(dst, n, tmp);
  free(tmp);
  return dst;
}

/* transform(M,v) = Mv */
VCTR_R(float*)transformNxNf(float *dst, int n, const float *m, const float *v)
{
  int i;
  float *tmp = malloc(sizeof(float)*n);
  
  for(i=0;i<n;i++)
    tmp[i] = dotNf(n, &m[i*n], v);

  cpyNf(dst, n, tmp);
  free(tmp);
  return dst;
}

VCTR_R(double*)transformNxNd(double *dst, int n, const double *m, const double *v)
{
  int i;
  double *tmp = malloc(sizeof(double)*n);
  
  for(i=0;i<n;i++)
    tmp[i] = dotNd(n, &m[i*n], v);

  cpyNd(dst, n, tmp);
  free(tmp);
  return dst;
}

/* tensor(M,v) = vMv */
VCTR_R(float) tensorNxNf(int n, const float *m, const float *v)
{
  float t, *tmp = malloc(sizeof(float)*n);
  t = dotNf(n, v, transformNxNf(tmp, n, m, v));
  free(tmp);
  return t;
}

VCTR_R(double)tensorNxNd(int n, const double *m, const double *v)
{
  double t, *tmp = malloc(sizeof(double)*n);
  t = dotNd(n, v, transformNxNd(tmp, n, m, v));
  free(tmp);
  return t;
}

/* transpose, flip over main diagonal */
VCTR_R(float*)transposeNxNf(float *dst, int n, const float *src)
{
  int i,j;

  for(i=0;i<n;i++)
  for(j=i;j<n;j++)
  {
    float a, b;
    a = src[j + i*n];
    b = src[i + j*n];
    dst[i + j*n] = a;
    dst[j + i*n] = b;
  }

  return dst;
}

VCTR_R(double*)transposeNxNd(double *dst, int n, const double *src)
{
  int i,j;

  for(i=0;i<n;i++)
  for(j=i;j<n;j++)
  {
    double a, b;
    a = src[j + i*n];
    b = src[i + j*n];
    dst[i + j*n] = a;
    dst[j + i*n] = b;
  }

  return dst;
}

VCTR_R(double*)convfd(double *dst, int n, const float *src)
{
  int i;

  for(i = 0; i < n; i++)
    dst[i] = (double)src[i];
  return dst;
}

VCTR_R(float *)convdf(float *dst, int n, const double *src)
{
  int i;

  for(i = 0; i < n; i++)
    dst[i] = (float)src[i];
  return dst;
}

#undef fabsf

#undef fabsd


#undef sqrtf

#undef sqrtd


VCTR_R(float) fabsf(float f)
{
  union {
    float f;
    unsigned long i;
  } fi;
  fi.f = f;
  fi.i &= 0x7FFFFFFF;
  return fi.f;
}

VCTR_R(double)fabsd(double d)
{
  return fabs(d);
}

VCTR_R(float) sqrtf(float f)
{
  return (float)sqrt((double)f);
}

VCTR_R(double)sqrtd(double d)
{
  return sqrt(d);
}