/* o-----------------------------------------------------------------------------o | | Matrix module implementation | | Methodical Accelerator Design - Copyright (c) 2016+ | Support: http://cern.ch/mad - mad at cern.ch | Authors: L. Deniau, laurent.deniau at cern.ch | Contrib: - | o-----------------------------------------------------------------------------o | You can redistribute this file and/or modify it under the terms of the GNU | General Public License GPLv3 (or later), as published by the Free Software | Foundation. This file is distributed in the hope that it will be useful, but | WITHOUT ANY WARRANTY OF ANY KIND. See http://gnu.org/licenses for details. o-----------------------------------------------------------------------------o */ #include #include #include #include #include #include #include #include "mad_log.h" #include "mad_mem.h" #include "mad_vec.h" #include "mad_mat.h" // --- helpers for debug ------------------------------------------------------o #if 0 #include static void __attribute__((unused)) mprint(str_t name, const num_t a[], ssz_t m, ssz_t n) { printf("%s[%dx%d]=\n", name, m, n); FOR(i,m) { FOR(j,n) printf("% -10.5f ", a[i*n+j]); printf("\n"); } } static void __attribute__((unused)) iprint(str_t name, const idx_t a[], ssz_t m, ssz_t n) { printf("%s[%dx%d]=\n", name, m, n); FOR(i,m) { FOR(j,n) printf("% 3d ", a[i*n+j]); printf("\n"); } } #endif // --- implementation ---------------------------------------------------------o #define CHKR assert( r ) #define CHKX assert( x ) #define CHKXY assert( x && y ) #define CHKXR assert( x && r ) #define CHKYR assert( y && r ) #define CHKXYR assert( x && y && r ) #define CHKXRX assert( x && r && x != r) #define CPX(re,im) (* (cpx_t*) & (num_t[2]) { re, im }) // --- matrix, cmatrix, imatrix struct matrix { ssz_t nr, nc; num_t data[]; }; struct cmatrix { ssz_t nr, nc; cpx_t data[]; }; struct imatrix { ssz_t nr, nc; idx_t data[]; }; // Note: matrix of zero size are forbidden void mad_mat_reshape (struct matrix *x, ssz_t m, ssz_t n) { CHKX; x->nr = MAX(1,m); x->nc = MAX(1,n); } void mad_cmat_reshape (struct cmatrix *x, ssz_t m, ssz_t n) { CHKX; x->nr = MAX(1,m); x->nc = MAX(1,n); } void mad_imat_reshape (struct imatrix *x, ssz_t m, ssz_t n) { CHKX; x->nr = MAX(1,m); x->nc = MAX(1,n); } // ----- #if 1 // r[m x n] = x[m x p] * y[p x n] // naive implementation (more efficient on recent superscalar arch!) #define MMUL() /* mat * mat */ \ FOR(i,m*n) r[i] = 0; \ FOR(i,m) FOR(k,p) FOR(j,n) r[i*n+j] += x[i*p+k] * y[k*n+j]; #else // [m x n] = [m x p] * [p x n] // portable vectorized general matrix-matrix multiplication // loop unroll + vectorized on SSE2 (x2), AVX & AVX2 (x4), AVX-512 (x8) #define MMUL() { /* mat * mat */ \ assert(m>0 && n>0 && p>0); \ if (n & ~7) { \ FOR(i,m) FOR(j,0,n-7,8) { \ r[i*n+j ] = r[i*n+j+1] = \ r[i*n+j+2] = r[i*n+j+3] = \ r[i*n+j+4] = r[i*n+j+5] = \ r[i*n+j+6] = r[i*n+j+7] = 0; \ FOR(k,p) { \ r[i*n+j ] += x[i*p+k] * y[k*n+j ]; \ r[i*n+j+1] += x[i*p+k] * y[k*n+j+1]; \ r[i*n+j+2] += x[i*p+k] * y[k*n+j+2]; \ r[i*n+j+3] += x[i*p+k] * y[k*n+j+3]; \ r[i*n+j+4] += x[i*p+k] * y[k*n+j+4]; \ r[i*n+j+5] += x[i*p+k] * y[k*n+j+5]; \ r[i*n+j+6] += x[i*p+k] * y[k*n+j+6]; \ r[i*n+j+7] += x[i*p+k] * y[k*n+j+7]; \ } \ } \ } \ if (n & 4) { \ idx_t j = n - (n & 7); \ FOR(i,m) { \ r[i*n+j ] = r[i*n+j+1] = \ r[i*n+j+2] = r[i*n+j+3] = 0; \ FOR(k,p) { \ r[i*n+j ] += x[i*p+k] * y[k*n+j ]; \ r[i*n+j+1] += x[i*p+k] * y[k*n+j+1]; \ r[i*n+j+2] += x[i*p+k] * y[k*n+j+2]; \ r[i*n+j+3] += x[i*p+k] * y[k*n+j+3]; \ } \ } \ } \ if (n & 2) { \ idx_t j = n - (n & 3); \ FOR(i,m) { \ r[i*n+j] = r[i*n+j+1] = 0; \ FOR(k,p) { \ r[i*n+j ] += x[i*p+k] * y[k*n+j ]; \ r[i*n+j+1] += x[i*p+k] * y[k*n+j+1]; \ } \ } \ } \ if (n & 1) { \ idx_t j = n - 1; \ FOR(i,m) { \ r[i*n+j] = 0; \ FOR(k,p) r[i*n+j] += x[i*p+k] * y[k*n+j]; \ } \ } \ } #endif // n=1: [m x 1] = [m x p] * [p x 1] #define MULV() /* mat * vec */ \ FOR(i,m) r[i] = 0; \ FOR(i,m) FOR(k,p) r[i] += x[i*p+k] * y[k]; // m=1: [1 x n] = [1 x p] * [p x n] #define VMUL() /* vec * mat */ \ FOR(j,n) r[j] = 0; \ FOR(k,p) FOR(j,n) r[j] += y[k*n+j] * x[k]; // m=1, n=1: [1 x 1] = [1 x p] * [p x 1] #define IMUL() /* vec * vec */ \ r[0] = 0; \ FOR(k,p) r[0] += x[k] * y[k]; // [m x n] = [m x p] * [p x n] #define MUL() \ switch(((m == 1) << 1) & (n == 1)) { \ case 0: MMUL(); break; \ case 1: MULV(); break; \ case 2: VMUL(); break; \ case 3: IMUL(); break; \ } // ----- #if 1 // [m x n] = [p x m]' * [p x n] // naive implementation (more efficient on recent superscalar arch!) #define TMMUL(C) /* mat' * mat */ \ FOR(i,m*n) r[i] = 0; \ FOR(i,m) FOR(k,p) FOR(j,n) r[i*n+j] += C(x[k*m+i]) * y[k*n+j]; #else // [m x n] = [p x m]' * [p x n] // portable vectorized general transpose matrix-matrix multiplication // loop unroll + vectorized on SSE2 (x2), AVX & AVX2 (x4), AVX-512 (x8) #define TMMUL(C) { /* mat' * mat */ \ assert(m>0 && n>0 && p>0); \ if (n & ~7) { \ FOR(i,m) FOR(j,0,n-7,8) { \ r[i*n+j ] = r[i*n+j+1] = \ r[i*n+j+2] = r[i*n+j+3] = \ r[i*n+j+4] = r[i*n+j+5] = \ r[i*n+j+6] = r[i*n+j+7] = 0; \ FOR(k,p) { \ r[i*n+j ] += C(x[k*m+i]) * y[k*n+j ]; \ r[i*n+j+1] += C(x[k*m+i]) * y[k*n+j+1]; \ r[i*n+j+2] += C(x[k*m+i]) * y[k*n+j+2]; \ r[i*n+j+3] += C(x[k*m+i]) * y[k*n+j+3]; \ r[i*n+j+4] += C(x[k*m+i]) * y[k*n+j+4]; \ r[i*n+j+5] += C(x[k*m+i]) * y[k*n+j+5]; \ r[i*n+j+6] += C(x[k*m+i]) * y[k*n+j+6]; \ r[i*n+j+7] += C(x[k*m+i]) * y[k*n+j+7]; \ } \ } \ } \ if (n & 4) { \ idx_t j = n - (n & 7); \ FOR(i,m) { \ r[i*n+j ] = r[i*n+j+1] = \ r[i*n+j+2] = r[i*n+j+3] = 0; \ FOR(k,p) { \ r[i*n+j ] += C(x[k*m+i]) * y[k*n+j ]; \ r[i*n+j+1] += C(x[k*m+i]) * y[k*n+j+1]; \ r[i*n+j+2] += C(x[k*m+i]) * y[k*n+j+2]; \ r[i*n+j+3] += C(x[k*m+i]) * y[k*n+j+3]; \ } \ } \ } \ if (n & 2) { \ idx_t j = n - (n & 3); \ FOR(i,m) { \ r[i*n+j] = r[i*n+j+1] = 0; \ FOR(k,p) { \ r[i*n+j ] += C(x[k*m+i]) * y[k*n+j ]; \ r[i*n+j+1] += C(x[k*m+i]) * y[k*n+j+1]; \ } \ } \ } \ if (n & 1) { \ idx_t j = n - 1; \ FOR(i,m) { \ r[i*n+j] = 0; \ FOR(k,p) r[i*n+j] += C(x[k*m+i]) * y[k*n+j]; \ } \ } \ } #endif // n=1: [m x 1] = [p x m]' * [p x 1] #define TMULV(C) /* mat' * vec */ \ FOR(i,m) r[i] = 0; \ FOR(k,p) FOR(i,m) r[i] += C(x[k*m+i]) * y[k]; // m=1: [1 x n] = [p x 1]' * [p x n] #define TVMUL(C) /* vec' * mat */ \ FOR(j,n) r[j] = 0; \ FOR(k,p) FOR(j,n) r[j] += C(x[k]) * y[k*n+j]; // m=1, n=1: [1 x 1] = [p x 1]' * [p x 1] #define TIMUL(C) /* vec' * vec */ \ r[0]= 0; FOR(k,p) r[0] += C(x[k]) * y[k]; // [m x n] = [p x m]' * [p x n] #define TMUL(C) \ switch(((m == 1) << 1) & (n == 1)) { \ case 0: TMMUL(C); break; \ case 1: TMULV(C); break; \ case 2: TVMUL(C); break; \ case 3: TIMUL(C); break; \ } // ----- #if 1 // [m x n] = [m x p] * [n x p]' // naive implementation (more efficient on recent superscalar arch!) #define MMULT(C) /* mat * mat' */ \ FOR(i,m*n) r[i] = 0; \ FOR(i,m) FOR(j,n) FOR(k,p) r[i*n+j] += x[i*p+k] * C(y[j*p+k]); #else // [m x n] = [m x p] * [n x p]' // portable vectorized general transpose matrix-matrix multiplication // loop unroll + vectorized on SSE2 (x2), AVX & AVX2 (x4), AVX-512 (x8) #define MMULT(C) { /* mat * mat' */ \ assert(m>0 && n>0 && p>0); \ FOR(i,m*n) r[i] = 0; \ if (n & ~7) { \ FOR(i,m) FOR(j,n) FOR(k,0,p-7,8) { \ r[i*n+j] += x[i*p+k ] * C(y[j*p+k ]); \ r[i*n+j] += x[i*p+k+1] * C(y[j*p+k+1]); \ r[i*n+j] += x[i*p+k+2] * C(y[j*p+k+2]); \ r[i*n+j] += x[i*p+k+3] * C(y[j*p+k+3]); \ r[i*n+j] += x[i*p+k+4] * C(y[j*p+k+4]); \ r[i*n+j] += x[i*p+k+5] * C(y[j*p+k+5]); \ r[i*n+j] += x[i*p+k+6] * C(y[j*p+k+6]); \ r[i*n+j] += x[i*p+k+7] * C(y[j*p+k+7]); \ } \ } \ if (p & 4) { \ idx_t k = p - (p & 7); \ FOR(i,m) FOR(j,n) { \ r[i*n+j] += x[i*p+k ] * C(y[j*p+k ]); \ r[i*n+j] += x[i*p+k+1] * C(y[j*p+k+1]); \ r[i*n+j] += x[i*p+k+2] * C(y[j*p+k+2]); \ r[i*n+j] += x[i*p+k+3] * C(y[j*p+k+3]); \ } \ } \ if (p & 2) { \ idx_t k = p - (p & 3); \ FOR(i,m) FOR(j,n) { \ r[i*n+j] += x[i*p+k ] * C(y[j*p+k ]); \ r[i*n+j] += x[i*p+k+1] * C(y[j*p+k+1]); \ } \ } \ if (p & 1) { \ idx_t k = p - 1; \ FOR(i,m) FOR(j,n) r[i*n+j] += x[i*p+k] * C(y[j*p+k]); \ } \ } #endif // n=1: [m x 1] = [m x p] * [1 x p]' #define MULVT(C) /* mat * vec' */ \ FOR(i,m) r[i] = 0; \ FOR(i,m) FOR(k,p) r[i] += x[i*p+k] * C(y[k]); // m=1: [1 x n] = [1 x p]' * [n x p]' #define VMULT(C) /* vec * mat' */ \ FOR(j,n) r[j] = 0; \ FOR(j,n) FOR(k,p) r[j] += x[k] * C(y[j*p+k]); // m=1, n=1: [1 x 1] = [1 x p] * [1 x p]' #define IMULT(C) /* vec * vec' */ \ r[0]= 0; FOR(k,p) r[0] += x[k] * C(y[k]); // [m x n] = [m x p] * [n x p]' #define MULT(C) \ switch(((m == 1) << 1) & (n == 1)) { \ case 0: MMULT(C); break; \ case 1: MULVT(C); break; \ case 2: VMULT(C); break; \ case 3: IMULT(C); break; \ } // ----- // r[m x n] = diag(x[m x p]) * y[p x n] // naive implementation (more efficient on recent superscalar arch!) #define DMUL() /* diag(mat) * mat */ \ if (p == 1) { \ FOR(i,m) FOR(j,n) r[i*n+j] = x[i] * y[i*n+j]; \ } else { \ FOR(i,m*n) r[i] = 0; \ FOR(i,MIN(m,p)) FOR(j,n) r[i*n+j] = x[i*p+i] * y[i*n+j]; \ } // r[m x n] = x[m x p] * diag(y[p x n]) // naive implementation (more efficient on recent superscalar arch!) #define MULD() /* mat * diag(mat) */ \ if (p == 1) { \ FOR(i,m) FOR(j,n) r[i*n+j] = x[i*n+j] * y[j]; \ } else { \ FOR(i,m*n) r[i] = 0; \ FOR(i,m) FOR(j,MIN(n,p)) r[i*n+j] = x[i*p+j] * y[j*n+j]; \ } // ----- // [m x n] transpose #define TRANS(T,C) \ if (m == 1 || n == 1) { \ if (x != r || I != C(I)) \ FOR(i,m*n) r[i] = C(x[i]); \ } else if ((const void*)x != (const void*)r) { \ FOR(i,m) FOR(j,n) r[j*m+i] = C(x[i*n+j]); \ } else if (m == n) { \ FOR(i,m) FOR(j,i,n) { \ T t = C(r[j*m+i]); \ r[j*m+i] = C(r[i*n+j]); \ r[i*n+j] = t; \ } \ } else { \ mad_alloc_tmp(T, t, m*n); \ FOR(i,m) FOR(j,n) t[j*m+i] = C(x[i*n+j]); \ memcpy(r, t, m*n*sizeof(T)); \ mad_free_tmp(t); \ }; // ----- // [m x n] copy [+ op] #define CPY(OP) FOR(i,m) FOR(j,n) r[i*ldr+j] OP##= x[i*ldx+j] // [m x n] set [+ op] #define SET(OP) FOR(i,m) FOR(j,n) r[i*ldr+j] OP##= x // [m x n] sequence [+ op] #define SEQ(OP) FOR(i,m) FOR(j,n) r[i*ldr+j] OP##= (i*ldr+j)+x // [m x n] diagonal [+ op] #define DIAG(OP) FOR(i, MIN(m,n)) r[i*ldr+i] OP##= x // --- mat void mad_mat_eye (num_t r[], num_t v, ssz_t m, ssz_t n, ssz_t ldr) { CHKR; num_t x = 0; SET(); x = v; DIAG(); } void mad_mat_copy (const num_t x[], num_t r[], ssz_t m, ssz_t n, ssz_t ldx, ssz_t ldr) { CHKXRX; CPY(); } void mad_mat_copym (const num_t x[], cpx_t r[], ssz_t m, ssz_t n, ssz_t ldx, ssz_t ldr) { CHKXR; CPY(); } void mad_mat_trans (const num_t x[], num_t r[], ssz_t m, ssz_t n) { CHKXR; TRANS(num_t,); } void mad_mat_mul (const num_t x[], const num_t y[], num_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { MUL(); return; } mad_alloc_tmp(num_t, r_, m*n); num_t *t = r; r = r_; MUL(); mad_vec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_mulm (const num_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (y != r) { MUL(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; MUL(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_tmul (const num_t x[], const num_t y[], num_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { TMUL(); return; } mad_alloc_tmp(num_t, r_, m*n); num_t *t = r; r = r_; TMUL(); mad_vec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_tmulm (const num_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (y != r) { TMUL(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; TMUL(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_mult (const num_t x[], const num_t y[], num_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { MULT(); return; } mad_alloc_tmp(num_t, r_, m*n); num_t *t = r; r = r_; MULT(); mad_vec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_multm (const num_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (y != r) { MULT(conj); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; MULT(conj); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_dmul (const num_t x[], const num_t y[], num_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { DMUL(); return; } mad_alloc_tmp(num_t, r_, m*n); num_t *t = r; r = r_; DMUL(); mad_vec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_dmulm (const num_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (y != r) { DMUL(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; DMUL(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_muld (const num_t x[], const num_t y[], num_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { MULD(); return; } mad_alloc_tmp(num_t, r_, m*n); num_t *t = r; r = r_; MULD(); mad_vec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_muldm (const num_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (y != r) { MULD(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; MULD(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_mat_center (num_t x[], ssz_t m, ssz_t n, int d) { CHKX; num_t mu; switch(d) { // 0=vec, 1=row, 2=col, 3=diag case 0: mu = 0; FOR(k,m*n) mu += x[k]; mu /= m*n; FOR(k,m*n) x[k] -= mu; break; case 1: FOR(i,m) { mu = 0; FOR(j,n) mu += x[i*n+j]; mu /= n; FOR(j,n) x[i*n+j] -= mu; } break; case 2: FOR(j,n) { mu = 0; FOR(i,m) mu += x[i*n+j]; mu /= m; FOR(i,m) x[i*n+j] -= mu; } break; case 3: mu = 0; FOR(k,MIN(m,n)) mu += x[k*n+k]; mu /= MIN(m,n); FOR(k,MIN(m,n)) x[k*n+k] -= mu; break; default: error("invalid direction"); } } void mad_mat_rev (num_t x[], ssz_t m, ssz_t n, int d) { CHKX; num_t t; switch(d) { // 0=vec, 1=row, 2=col, 3=diag case 0: FOR(i,(m*n)/2) SWAP(x[i ], x[ m*n-1-i] , t); break; case 1: FOR(i,m ) FOR(j,n/2) SWAP(x[i*n+j], x[ i *n +n-1-j] , t); break; case 2: FOR(i,m/2) FOR(j,n ) SWAP(x[i*n+j], x[(m-1-i)*n + j] , t); break; case 3: FOR(i,MIN(m,n)/2) SWAP(x[i*n+i], x[(MIN(m,n)-1-i)*(n+1)], t); break; default: error("invalid direction"); } } void mad_mat_roll (num_t x[], ssz_t m, ssz_t n, int mroll, int nroll) { CHKX; mroll %= m; nroll %= n; ssz_t nm = n*m, msz = n*abs(mroll), nsz = abs(nroll); ssz_t sz = msz > nsz ? msz : nsz; mad_alloc_tmp(num_t, a, sz); if (mroll > 0) { mad_vec_copy(x+nm-msz, a , msz); // end of x to a mad_vec_copy(x , x+msz, nm-msz); // shift x down mad_vec_copy(a , x , msz); // a to beginning of x } else if (mroll < 0) { mad_vec_copy(x , a , msz); // beginning of x to a mad_vec_copy(x+msz, x , nm-msz); // shift x up mad_vec_copy(a , x+nm-msz, msz); // a to end of x } if (nroll > 0) FOR(i,0,nm,n) { mad_vec_copy(x+i+n-nsz, a , nsz); // end of x to a mad_vec_copy(x+i , x+i+nsz, n-nsz); // shift x right mad_vec_copy(a , x+i , nsz); // a to beginning of x } else if (nroll < 0) FOR(i,0,nm,n) { mad_vec_copy(x+i , a , nsz); // beginning of x to a mad_vec_copy(x+i+nsz, x+i , n-nsz); // shift x left mad_vec_copy(a , x+i+n-nsz, nsz); // a to end of x } mad_free_tmp(a); } // -- cmat void mad_cmat_eye (cpx_t r[], cpx_t v, ssz_t m, ssz_t n, ssz_t ldr) { CHKR; cpx_t x = 0; SET(); x = v; DIAG(); } void mad_cmat_eye_r (cpx_t r[], num_t v_re, num_t v_im, ssz_t m, ssz_t n, ssz_t ldr) { CHKR; mad_cmat_eye(r, CPX(v_re,v_im), m, n, ldr); } void mad_cmat_copy (const cpx_t x[], cpx_t r[], ssz_t m, ssz_t n, ssz_t ldx, ssz_t ldr) { CHKXRX; CPY(); } void mad_cmat_trans (const cpx_t x[], cpx_t r[], ssz_t m, ssz_t n) { CHKXR; TRANS(cpx_t,); } void mad_cmat_ctrans (const cpx_t x[], cpx_t r[], ssz_t m, ssz_t n) { CHKXR; TRANS(cpx_t,conj); } void mad_cmat_mul (const cpx_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { MUL(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; MUL(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_mulm (const cpx_t x[], const num_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r) { MUL(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; MUL(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_tmul (const cpx_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { TMUL(conj); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; TMUL(conj); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_tmulm (const cpx_t x[], const num_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r) { TMUL(conj); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; TMUL(conj); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_mult (const cpx_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { MULT(conj); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; MULT(conj); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_multm (const cpx_t x[], const num_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r) { MULT(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; MULT(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_dmul (const cpx_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { DMUL(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; DMUL(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_dmulm (const cpx_t x[], const num_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r) { DMUL(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; DMUL(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_muld (const cpx_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r && y != r) { MULD(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; MULD(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_muldm (const cpx_t x[], const num_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p) { CHKXYR; if (x != r) { MULD(); return; } mad_alloc_tmp(cpx_t, r_, m*n); cpx_t *t = r; r = r_; MULD(); mad_cvec_copy(r_, t, m*n); mad_free_tmp(r_); } void mad_cmat_center (cpx_t x[], ssz_t m, ssz_t n, int d) { CHKX; cpx_t mu; switch(d) { // 0=vec, 1=row, 2=col, 3=diag case 0: mu = 0; FOR(k,m*n) mu += x[k]; mu /= m*n; FOR(k,m*n) x[k] -= mu; break; case 1: FOR(i,m) { mu = 0; FOR(j,n) mu += x[i*n+j]; mu /= n; FOR(j,n) x[i*n+j] -= mu; } break; case 2: FOR(j,n) { mu = 0; FOR(i,m) mu += x[i*n+j]; mu /= m; FOR(i,m) x[i*n+j] -= mu; } break; case 3: mu = 0; FOR(k,MIN(m,n)) mu += x[k*n+k]; mu /= MIN(m,n); FOR(k,MIN(m,n)) x[k*n+k] -= mu; break; default: error("invalid direction"); } } void mad_cmat_rev (cpx_t x[], ssz_t m, ssz_t n, int d) { CHKX; cpx_t t; switch(d) { // 0=vec, 1=row, 2=col, 3=diag case 0: FOR(i,(m*n)/2) SWAP(x[i ], x[ m*n-1-i], t); break; case 1: FOR(i,m ) FOR(j,n/2) SWAP(x[i*n +j], x[ i *n +n-1-j], t); break; case 2: FOR(i,m/2) FOR(j,n ) SWAP(x[i*n +j], x[(m-1-i)*n + j], t); break; case 3: FOR(i,MIN(m,n)/2) SWAP(x[i*n +i], x[(m-1-i)*n + i], t); break; default: error("invalid direction"); } } void mad_cmat_roll (cpx_t x[], ssz_t m, ssz_t n, int mroll, int nroll) { mad_mat_roll((num_t*)x, m, 2*n, mroll, 2*nroll); } // --- imat void mad_imat_eye (idx_t r[], idx_t v, ssz_t m, ssz_t n, ssz_t ldr) { CHKR; idx_t x = 0; SET(); x = v; DIAG(); } void mad_imat_copy (const idx_t x[], idx_t r[], ssz_t m, ssz_t n, ssz_t ldx, ssz_t ldr) { CHKXRX; CPY(); } void mad_imat_copym (const idx_t x[], num_t r[], ssz_t m, ssz_t n, ssz_t ldx, ssz_t ldr) { CHKXR; CPY(); } void mad_imat_trans (const idx_t x[], idx_t r[], ssz_t m, ssz_t n) { CHKXR; TRANS(idx_t,); } void mad_imat_rev (idx_t x[], ssz_t m, ssz_t n, int d) { CHKX; idx_t t; switch(d) { // 0=vec, 1=row, 2=col, 3=diag case 0: FOR(i,(m*n)/2) SWAP(x[i ], x[ m*n-1-i], t); break; case 1: FOR(i,m ) FOR(j,n/2) SWAP(x[i*n +j], x[ i *n +n-1-j], t); break; case 2: FOR(i,m/2) FOR(j,n ) SWAP(x[i*n +j], x[(m-1-i)*n + j], t); break; case 3: FOR(i,MIN(m,n)/2) SWAP(x[i*n +i], x[(m-1-i)*n + i], t); break; default: error("invalid direction"); } } void mad_imat_roll (idx_t x[], ssz_t m, ssz_t n, int mroll, int nroll) { CHKX; mroll %= m; nroll %= n; ssz_t nm = n*m, msz = n*abs(mroll), nsz = abs(nroll); ssz_t sz = msz > nsz ? msz : nsz; mad_alloc_tmp(idx_t, a, sz); if (mroll > 0) { // roll rows mad_ivec_copy(x+nm-msz, a , msz); // end of x to a mad_ivec_copy(x , x+msz, nm-msz); // shift x down mad_ivec_copy(a , x , msz); // a to beginning of x } else if (mroll < 0) { mad_ivec_copy(x , a , msz); // beginning of x to a mad_ivec_copy(x+msz, x , nm-msz); // shift x up mad_ivec_copy(a , x+nm-msz, msz); // a to end of x } if (nroll > 0) FOR(i,0,nm,n) { mad_ivec_copy(x+i+n-nsz, a , nsz); // end of x to a mad_ivec_copy(x+i , x+i+nsz, n-nsz); // shift x right mad_ivec_copy(a , x+i , nsz); // a to beginning of x } else if (nroll < 0) FOR(i,0,nm,n) { mad_ivec_copy(x+i , a , nsz); // beginning of x to a mad_ivec_copy(x+i+nsz, x+i , n-nsz); // shift x left mad_ivec_copy(a , x+i+n-nsz, nsz); // a to end of x } mad_free_tmp(a); } // -- Symplectic matrices -----------------------------------------------------o // M[2n x 2n] accessed as n blocks of [a b ; c d] #define a_(x,i,j) x[ i *n+j ] #define b_(x,i,j) x[ i *n+j+1] #define c_(x,i,j) x[(i+1)*n+j ] #define d_(x,i,j) x[(i+1)*n+j+1] // -- Symplecticity error, compute M' J M - J ---------------------------------o num_t mad_mat_symperr (const num_t x[], num_t r_[], ssz_t n, num_t *tol_) { CHKX; assert(!(n & 1)); num_t s=0, s0, s1, s2, s3; ssz_t nn = n*n; mad_alloc_tmp(num_t, r, nn); for (idx_t i = 0; i < n-1; i += 2) { // i == j s1 = -1, s2 = 1; for (idx_t k = 0; k < n-1; k += 2) { s1 += a_(x,k,i) * d_(x,k,i) - b_(x,k,i) * c_(x,k,i); s2 += b_(x,k,i) * c_(x,k,i) - a_(x,k,i) * d_(x,k,i); } s += s1*s1 + s2*s2; b_(r,i,i) = s1, c_(r,i,i) = s2, a_(r,i,i) = d_(r,i,i) = 0; // i < j for (idx_t j = i+2; j < n-1; j += 2) { s0 = s1 = s2 = s3 = 0; for (idx_t k = 0; k < n-1; k += 2) { s0 += a_(x,k,i) * c_(x,k,j) - a_(x,k,j) * c_(x,k,i); s1 += a_(x,k,i) * d_(x,k,j) - b_(x,k,j) * c_(x,k,i); s2 += b_(x,k,i) * c_(x,k,j) - a_(x,k,j) * d_(x,k,i); s3 += b_(x,k,i) * d_(x,k,j) - b_(x,k,j) * d_(x,k,i); } s += 2*(s0*s0 + s1*s1 + s2*s2 + s3*s3); a_(r,i,j) = s0, b_(r,i,j) = s1, c_(r,i,j) = s2, d_(r,i,j) = s3; a_(r,j,i) = -s0, b_(r,j,i) = -s2, c_(r,j,i) = -s1, d_(r,j,i) = -s3; } } if (tol_) { num_t tol = MAX(0,*tol_) ; *tol_ = 1; // is_symp = true for (idx_t i = 0; i < nn; i++) if (fabs(r[i]) > tol) {*tol_ = 0; break;} } if (r_) mad_vec_copy(r, r_, nn); mad_free_tmp(r); return sqrt(s); } num_t mad_cmat_symperr (const cpx_t x[], cpx_t r_[], ssz_t n, num_t *tol_) { CHKX; assert(!(n & 1)); cpx_t s=0, s0, s1, s2, s3; ssz_t nn = n*n; mad_alloc_tmp(cpx_t, r, nn); for (idx_t i = 0; i < n-1; i += 2) { // i == j s1 = -1, s2 = 1; for (idx_t k = 0; k < n-1; k += 2) { s1 += conj(a_(x,k,i)) * d_(x,k,i) - b_(x,k,i) * conj(c_(x,k,i)); s2 += conj(b_(x,k,i)) * c_(x,k,i) - a_(x,k,i) * conj(d_(x,k,i)); } s += s1*s1 + s2*s2; b_(r,i,i) = s1, c_(r,i,i) = s2, a_(r,i,i) = d_(r,i,i) = 0; // i < j for (idx_t j = i+2; j < n-1; j += 2) { s0 = s1 = s2 = s3 = 0; for (idx_t k = 0; k < n-1; k += 2) { s0 += conj(a_(x,k,i)) * c_(x,k,j) - a_(x,k,j) * conj(c_(x,k,i)); s1 += conj(a_(x,k,i)) * d_(x,k,j) - b_(x,k,j) * conj(c_(x,k,i)); s2 += conj(b_(x,k,i)) * c_(x,k,j) - a_(x,k,j) * conj(d_(x,k,i)); s3 += conj(b_(x,k,i)) * d_(x,k,j) - b_(x,k,j) * conj(d_(x,k,i)); } s += 2*(s0*s0 + s1*s1 + s2*s2 + s3*s3); a_(r,i,j) = s0, b_(r,i,j) = s1, c_(r,i,j) = s2, d_(r,i,j) = s3; a_(r,j,i) = -s0, b_(r,j,i) = -s2, c_(r,j,i) = -s1, d_(r,j,i) = -s3; } } if (tol_) { num_t tol = MAX(0,*tol_) ; *tol_ = 1; // is_symp = true for (idx_t i = 0; i < nn; i++) if (cabs(r[i]) > tol) {*tol_ = 0; break;} } if (r_) mad_cvec_copy(r, r_, nn); mad_free_tmp(r); return sqrt(cabs(s)); } // -- Symplectic conjugate, compute \bar{M} = -J M' J -------------------------o void mad_mat_sympconj (const num_t x[], num_t r[], ssz_t n) { CHKXR; assert(!(n & 1)); num_t t; for (idx_t i = 0; i < n-1; i += 2) { // 2x2 blocks on diagonal t = a_(x,i,i), a_(r,i,i) = d_(x,i,i), d_(r,i,i) = t; b_(r,i,i) = -b_(x,i,i), c_(r,i,i) = -c_(x,i,i); for (idx_t j = i+2; j < n-1; j += 2) { // 2x2 blocks off diagonal t = a_(x,i,j), a_(r,i,j) = d_(x,j,i), d_(r,j,i) = t; t = b_(x,i,j), b_(r,i,j) = -b_(x,j,i), b_(r,j,i) = -t; t = c_(x,i,j), c_(r,i,j) = -c_(x,j,i), c_(r,j,i) = -t; t = d_(x,i,j), d_(r,i,j) = a_(x,j,i), a_(r,j,i) = t; } } } void mad_cmat_sympconj (const cpx_t x[], cpx_t r[], ssz_t n) { CHKXR; assert(!(n & 1)); cpx_t t; for (idx_t i = 0; i < n-1; i += 2) { // 2x2 blocks on diagonal t = a_(x,i,i), a_(r,i,i) = conj(d_(x,i,i)), d_(r,i,i) = conj(t); b_(r,i,i) = -conj(b_(x,i,i)), c_(r,i,i) = -conj(c_(x,i,i)); for (idx_t j = i+2; j < n-1; j += 2) { // 2x2 blocks off diagonal t = a_(x,i,j), a_(r,i,j) = conj(d_(x,j,i)), d_(r,j,i) = conj(t); t = b_(x,i,j), b_(r,i,j) = -conj(b_(x,j,i)), b_(r,j,i) = -conj(t); t = c_(x,i,j), c_(r,i,j) = -conj(c_(x,j,i)), c_(r,j,i) = -conj(t); t = d_(x,i,j), d_(r,i,j) = conj(a_(x,j,i)), a_(r,j,i) = conj(t); } } } #undef a_ #undef b_ #undef c_ #undef d_ // -- lapack ------------------------------------------------------------------o /* LAPACK is the default method for computing LU decomposition. When matrix is square and nonsinguler the routines dgetrf and zgetrf. LAPACK is the default method for solving linear least squares problems for dense matrices. When the matrix is square and nonsingular the routines dgesv and zgesv are used otherwise routines dgelsy and zgelsy are used. Alternate methods dgelsd and zgelsd based on SVD can be used too. The general least squares problems for dense matrices uses dgglse and zgglse for the first kind, and dggglm and zggglm for the second kind. LAPACK is the default method for computing the entire set of singluar values and singular vectors. For generalized SVD the routines dgesdd and zgesdd are used. LAPACK is the default method for computing the entire set of eigenvalues and eigenvectors. For simple eigenvalues the routines dgeev and zgeev are used. For generalized eigenvalues the routines dggev and zggev are used. LAPACK C examples using F77 interface can be found at: https://software.intel.com/sites/products/documentation/doclib/mkl_sa/11/mkl_lapack_examples/ LAPACK F77 examples with data can be found at: https://github.com/numericalalgorithmsgroup/LAPACK_Examples/tree/master/examples */ // ----- // Decompose A = LU with A[m x n] (generalized) // ----- void dgetrf_ (const int *m, const int *n, num_t A[], const int *lda, int *IPIV, int *info); void zgetrf_ (const int *m, const int *n, cpx_t A[], const int *lda, int *IPIV, int *info); // ----- // Solve A * X = B with A[n x n], B[n x nrhs] and X[n x nrhs]: search min |} b - Ax ||_2 using LU // ----- void dgesv_ (const int *n, const int *nrhs, num_t A[], const int *lda, int *IPIV, num_t B[], const int *ldb, int *info); void zgesv_ (const int *n, const int *nrhs, cpx_t A[], const int *lda, int *IPIV, cpx_t B[], const int *ldb, int *info); // ----- // Solve A * X = B with A[m x n], B[m x nrhs] and X[m x nrhs]: search min || b - Ax ||_2 using QR // ----- void dgelsy_ (const int *m, const int *n, const int *nrhs, num_t A[], const int *lda, num_t B[], const int *ldb, int jpvt[], const num_t *rcond, int *rank, num_t work[], const int lwork[], int *info); void zgelsy_ (const int *m, const int *n, const int *nrhs, cpx_t A[], const int *lda, cpx_t B[], const int *ldb, int jpvt[], const num_t *rcond, int *rank, cpx_t work[], const int lwork[], num_t rwork[], int *info); // ----- // Solve A * X = B with A[m x n], B[m x nrhs] and X[m x nrhs]: search min || b - Ax ||_2 using SVD // ----- void dgelsd_ (const int *m, const int *n, const int *nrhs, num_t A[], const int *lda, num_t B[], const int *ldb, num_t S[], const num_t *rcond, int *rank, num_t work[], int *lwork, int iwork[], int *info); void zgelsd_ (const int *m, const int *n, const int *nrhs, cpx_t A[], const int *lda, cpx_t B[], const int *ldb, num_t S[], const num_t *rcond, int *rank, cpx_t work[], int *lwork, num_t rwork[], int iwork[], int *info); // ----- // LS minimization: min_x || c - A*x ||_2 subject to B*x = d using QR // ----- void dgglse_ (const int *m, const int *n, const int *p, num_t A[], const int *lda, num_t B[], const int *ldb, num_t C[], num_t D[], num_t X[], num_t work[], int *lwork, int *info); void zgglse_ (const int *m, const int *n, const int *p, cpx_t A[], const int *lda, cpx_t B[], const int *ldb, cpx_t C[], cpx_t D[], cpx_t X[], cpx_t work[], int *lwork, int *info); // ----- // LS minimization: min_x || y ||_2 subject to A*x + B*y = d using QR // ----- void dggglm_ (const int *m, const int *n, const int *p, num_t A[], const int *lda, num_t B[], const int *ldb, num_t D[], num_t X[], num_t Y[], num_t work[], int *lwork, int *info); void zggglm_ (const int *m, const int *n, const int *p, cpx_t A[], const int *lda, cpx_t B[], const int *ldb, cpx_t D[], cpx_t X[], cpx_t Y[], cpx_t work[], int *lwork, int *info); // ----- // SVD A[m x n] // ----- void dgesdd_ (str_t jobz, const int *m, const int *n, num_t A[], const int *lda, num_t S[], num_t U[], const int *ldu, num_t VT[], const int *ldvt, num_t work[], int *lwork, int iwork[], int *info); void zgesdd_ (str_t jobz, const int *m, const int *n, cpx_t A[], const int *lda, num_t S[], cpx_t U[], const int *ldu, cpx_t VT[], const int *ldvt, cpx_t work[], int *lwork, num_t rwork[], int iwork[], int *info); // ----- // Eigen values/vectors A[n x n] // ----- void dgeev_ (str_t jobvl, str_t jobvr, const int *n, num_t A[], const int *lda, num_t WR[], num_t WI[], num_t VL[], const int *ldvl, num_t VR[], const int *ldvr, num_t work[], int *lwork, int *info); void zgeev_ (str_t jobvl, str_t jobvr, const int *n, cpx_t A[], const int *lda, cpx_t W[], cpx_t VL[], const int *ldvl, cpx_t VR[], const int *ldvr, cpx_t work[], int *lwork, num_t rwork[], int *info); // -- determinant -------------------------------------------------------------o int mad_mat_det (const num_t x[], num_t *r, ssz_t n) { CHKX; const int nn=n; int info=0, ipiv[n]; mad_alloc_tmp(num_t, a, n*n); mad_vec_copy(x, a, n*n); dgetrf_(&nn, &nn, a, &nn, ipiv, &info); if (info < 0) error("Det: invalid input argument"); int perm = 0; num_t det = 1; for (int i=0, j=0; i < n; i++, j+=n+1) det *= a[j], perm += ipiv[i] != i+1; mad_free_tmp(a); *r = perm & 1 ? -det : det; return info; } int mad_cmat_det (const cpx_t x[], cpx_t *r, ssz_t n) { CHKX; const int nn=n; int info=0, ipiv[n]; mad_alloc_tmp(cpx_t, a, n*n); mad_cvec_copy(x, a, n*n); zgetrf_(&nn, &nn, a, &nn, ipiv, &info); if (info < 0) error("Det: invalid input argument"); int perm = 0; cpx_t det = 1; for (int i=0, j=0; i < n; i++, j+=n+1) det *= a[j], perm += ipiv[i] != i+1; mad_free_tmp(a); *r = perm & 1 ? -det : det; return info; } // -- inverse -----------------------------------------------------------------o int mad_mat_invn (const num_t y[], num_t x, num_t r[], ssz_t m, ssz_t n, num_t rcond) { CHKYR; // compute U:[n x n]/Y:[m x n] mad_alloc_tmp(num_t, u, n*n); mad_mat_eye(u, 1, n, n, n); #pragma GCC diagnostic push // remove false-positive #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" int rank = mad_mat_div(u, y, r, n, m, n, rcond); #pragma GCC diagnostic pop mad_free_tmp(u); if (x != 1) mad_vec_muln(r, x, r, m*n); return rank; } int // without complex-by-value version mad_mat_invc_r (const num_t y[], num_t x_re, num_t x_im, cpx_t r[], ssz_t m, ssz_t n, num_t rcond) { return mad_mat_invc(y, CPX(x_re,x_im), r, m, n, rcond); } int mad_mat_invc (const num_t y[], cpx_t x, cpx_t r[], ssz_t m, ssz_t n, num_t rcond) { CHKYR; // compute U:[n x n]/Y:[m x n] mad_alloc_tmp(num_t, t, m*n); mad_alloc_tmp(num_t, u, n*n); mad_mat_eye(u, 1, n, n, n); int rank = mad_mat_div(u, y, t, n, m, n, rcond); mad_free_tmp(u); if (x != 1) mad_vec_mulc(t, x, r, m*n); mad_free_tmp(t); return rank; } int mad_cmat_invn (const cpx_t y[], num_t x, cpx_t r[], ssz_t m, ssz_t n, num_t rcond) { CHKYR; // compute U:[n x n]/Y:[m x n] mad_alloc_tmp(cpx_t, u, n*n); mad_cmat_eye(u, 1, n, n, n); #pragma GCC diagnostic push // remove false-positive #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" int rank = mad_cmat_div(u, y, r, n, m, n, rcond); #pragma GCC diagnostic pop mad_free_tmp(u); if (x != 1) mad_cvec_muln(r, x, r, m*n); return rank; } int mad_cmat_invc (const cpx_t y[], cpx_t x, cpx_t r[], ssz_t m, ssz_t n, num_t rcond) { CHKYR; // compute U:[n x n]/Y:[m x n] mad_alloc_tmp(cpx_t, u, n*n); mad_cmat_eye(u, 1, n, n, n); #pragma GCC diagnostic push // remove false-positive #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" int rank = mad_cmat_div(u, y, r, n, m, n, rcond); #pragma GCC diagnostic pop mad_free_tmp(u); if (x != 1) mad_cvec_mulc(r, x, r, m*n); return rank; } int mad_cmat_invc_r (const cpx_t y[], num_t x_re, num_t x_im, cpx_t r[], ssz_t m, ssz_t n, num_t rcond) { return mad_cmat_invc(y, CPX(x_re,x_im), r, m, n, rcond); } // -- divide ------------------------------------------------------------------o // note: // X/Y => X * Y^-1 => [m x p] * [p x n] => X:[m x p], Y:[n x p] // X/Y => X * Y^-1 => (Y'^-1 * X')' => A=Y' and B=X' // Solving A*X=B => X = A^-1 B = (B'/A')' (col-major!) // with Y':[p x n] = A:[M=p x N=n], // and X':[p x m] = B:[M=p x NRHS=m], ipiv:[N] int mad_mat_div (const num_t x[], const num_t y[], num_t r[], ssz_t m, ssz_t n, ssz_t p, num_t rcond) { CHKXYR; int info=0; const int nm=m, nn=n, np=p; mad_alloc_tmp(num_t, a, n*p); mad_vec_copy(y, a, n*p); // square system (y is square, n == p), use LU decomposition if (n == p) { int ipiv[n]; mad_vec_copy(x, r, m*p); dgesv_(&np, &nm, a, &np, ipiv, r, &np, &info); if (!info) return mad_free_tmp(a), n; if (info > 0) warn("Div: singular matrix, no solution found"); } // non-square system or singular square system, use QR or LQ factorization num_t sz; int rank, ldb=MAX(nn,np), lwork=-1; // query for optimal size int JPVT[nn]; memset(JPVT, 0, sizeof JPVT); mad_alloc_tmp(num_t, rr, ldb*nm); mad_mat_copy(x, rr, m, p, p, ldb); // input strided copy [M x NRHS] dgelsy_(&np, &nn, &nm, a, &np, rr, &ldb, JPVT, &rcond, &rank, &sz, &lwork, &info); // query mad_alloc_tmp(num_t, wk, lwork=sz); dgelsy_(&np, &nn, &nm, a, &np, rr, &ldb, JPVT, &rcond, &rank, wk, &lwork, &info); // compute mad_mat_copy(rr, r, m, n, ldb, n); // output strided copy [N x NRHS] mad_free_tmp(wk); mad_free_tmp(rr); mad_free_tmp(a); if (info < 0) error("Div: invalid input argument"); if (info > 0) error("Div: unexpected lapack error"); return rank; } int mad_mat_divm (const num_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p, num_t rcond) { CHKXYR; int info=0; const int nm=m, nn=n, np=p; mad_alloc_tmp(cpx_t, a, n*p); mad_cvec_copy(y, a, n*p); // square system (y is square, n == p), use LU decomposition if (n == p) { int ipiv[n]; mad_vec_copyv(x, r, m*p); zgesv_(&np, &nm, a, &np, ipiv, r, &np, &info); if (!info) return mad_free_tmp(a), n; if (info > 0) warn("Div: singular matrix, no solution found"); } // non-square system or singular square system, use QR or LQ factorization cpx_t sz; num_t rwk[2*nn]; int rank, ldb=MAX(nn,np), lwork=-1; // query for optimal size int JPVT[nn]; memset(JPVT, 0, sizeof JPVT); mad_alloc_tmp(cpx_t, rr, ldb*nm); mad_mat_copym(x, rr, m, p, p, ldb); // input strided copy [M x NRHS] zgelsy_(&np, &nn, &nm, a, &np, rr, &ldb, JPVT, &rcond, &rank, &sz, &lwork, rwk, &info); // query mad_alloc_tmp(cpx_t, wk, lwork=creal(sz)); zgelsy_(&np, &nn, &nm, a, &np, rr, &ldb, JPVT, &rcond, &rank, wk, &lwork, rwk, &info); // compute mad_cmat_copy(rr, r, m, n, ldb, n); // output strided copy [N x NRHS] mad_free_tmp(wk); mad_free_tmp(rr); mad_free_tmp(a); if (info < 0) error("Div: invalid input argument"); if (info > 0) error("Div: unexpected lapack error"); return rank; } int mad_cmat_div (const cpx_t x[], const cpx_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p, num_t rcond) { CHKXYR; int info=0; const int nm=m, nn=n, np=p; mad_alloc_tmp(cpx_t, a, n*p); mad_cvec_copy(y, a, n*p); // square system (y is square, n == p), use LU decomposition if (n == p) { int ipiv[n]; mad_cvec_copy(x, r, m*p); zgesv_(&np, &nm, a, &np, ipiv, r, &np, &info); if (!info) return mad_free_tmp(a), n; if (info > 0) warn("Div: singular matrix, no solution found"); } // non-square system or singular square system, use QR or LQ factorization cpx_t sz; num_t rwk[2*nn]; int rank, ldb=MAX(nn,np), lwork=-1; // query for optimal size int JPVT[nn]; memset(JPVT, 0, sizeof JPVT); mad_alloc_tmp(cpx_t, rr, ldb*nm); mad_cmat_copy(x, rr, m, p, p, ldb); // input strided copy [M x NRHS] zgelsy_(&np, &nn, &nm, a, &np, rr, &ldb, JPVT, &rcond, &rank, &sz, &lwork, rwk, &info); // query mad_alloc_tmp(cpx_t, wk, lwork=creal(sz)); zgelsy_(&np, &nn, &nm, a, &np, rr, &ldb, JPVT, &rcond, &rank, wk, &lwork, rwk, &info); // compute mad_cmat_copy(rr, r, m, n, ldb, n); // output strided copy [N x NRHS] mad_free_tmp(wk); mad_free_tmp(rr); mad_free_tmp(a); if (info < 0) error("Div: invalid input argument"); if (info > 0) error("Div: unexpected lapack error"); return rank; } int mad_cmat_divm (const cpx_t x[], const num_t y[], cpx_t r[], ssz_t m, ssz_t n, ssz_t p, num_t rcond) { CHKXYR; int info=0; const int nm=m, nn=n, np=p; mad_alloc_tmp(cpx_t, a, n*p); mad_vec_copyv(y, a, n*p); // square system (y is square, n == p), use LU decomposition if (n == p) { int ipiv[n]; mad_cvec_copy(x, r, m*p); zgesv_(&np, &nm, a, &np, ipiv, r, &np, &info); if (!info) return mad_free_tmp(a), n; if (info > 0) warn("Div: singular matrix, no solution found"); } // non-square system or singular square system, use QR or LQ factorization cpx_t sz; num_t rwk[2*nn]; int rank, ldb=MAX(nn,np), lwork=-1; // query for optimal size int JPVT[nn]; memset(JPVT, 0, sizeof JPVT); mad_alloc_tmp(cpx_t, rr, ldb*nm); mad_cmat_copy(x, rr, m, p, p, ldb); // input strided copy [M x NRHS] zgelsy_(&np, &nn, &nm, a, &np, rr, &ldb, JPVT, &rcond, &rank, &sz, &lwork, rwk, &info); // query mad_alloc_tmp(cpx_t, wk, lwork=creal(sz)); zgelsy_(&np, &nn, &nm, a, &np, rr, &ldb, JPVT, &rcond, &rank, wk, &lwork, rwk, &info); // compute mad_cmat_copy(rr, r, m, n, ldb, n); // output strided copy [N x NRHS] mad_free_tmp(wk); mad_free_tmp(rr); mad_free_tmp(a); if (info < 0) error("Div: invalid input argument"); if (info > 0) error("Div: unexpected lapack error"); return rank; } // -- SVD ---------------------------------------------------------------------o // SVD decomposition A = U * S * V.t() // A:[m x n], U:[m x m], S:[min(m,n)], V:[n x n] int mad_mat_svd (const num_t x[], num_t u[], num_t s[], num_t v[], ssz_t m, ssz_t n) { assert( x && u && s && v ); int info=0; const int nm=m, nn=n, mn=MIN(m,n); num_t sz; int lwork=-1; int iwk[8*mn]; mad_alloc_tmp(num_t, ra, m*n); mad_mat_trans(x, ra, m, n); dgesdd_("A", &nm, &nn, ra, &nm, s, u, &nm, v, &nn, &sz, &lwork, iwk, &info); // query mad_alloc_tmp(num_t, wk, lwork=sz); dgesdd_("A", &nm, &nn, ra, &nm, s, u, &nm, v, &nn, wk, &lwork, iwk, &info); // compute mad_free_tmp(wk); mad_free_tmp(ra); mad_mat_trans(u, u, m, m); if (info < 0) error("SVD: invalid input argument"); if (info > 0) warn ("SVD: failed to converge"); return info; } int mad_cmat_svd (const cpx_t x[], cpx_t u[], num_t s[], cpx_t v[], ssz_t m, ssz_t n) { assert( x && u && s && v ); int info=0; const int nm=m, nn=n, mn=MIN(m,n); cpx_t sz; int lwork=-1; int iwk[8*mn]; ssz_t rwk_sz = mn * MAX(5*mn+7, 2*MAX(m,n)+2*mn+1); mad_alloc_tmp(num_t, rwk, rwk_sz); mad_alloc_tmp(cpx_t, ra, m*n); mad_cmat_trans(x, ra, m, n); zgesdd_("A", &nm, &nn, ra, &nm, s, u, &nm, v, &nn, &sz, &lwork, rwk, iwk, &info); // query mad_alloc_tmp(cpx_t, wk, lwork=creal(sz)); zgesdd_("A", &nm, &nn, ra, &nm, s, u, &nm, v, &nn, wk, &lwork, rwk, iwk, &info); // compute mad_free_tmp(wk); mad_free_tmp(ra); mad_free_tmp(rwk); mad_cmat_trans(u, u, m, m); mad_cvec_conj (v, v, n*n); if (info < 0) error("SVD: invalid input argument"); if (info > 0) warn ("SVD: failed to converge"); return info; } // -- LEAST SQUARE Solvers ----------------------------------------------------o int mad_mat_solve (const num_t a[], const num_t b[], num_t x[], ssz_t m, ssz_t n, ssz_t p, num_t rcond) { assert( a && b && x ); int info=0; const int nm=m, nn=n, np=p, mn=MAX(m,n); num_t sz; int lwork=-1, rank; int pvt[nn]; memset(pvt, 0, sizeof pvt); mad_alloc_tmp(num_t, ta, m*n); mad_alloc_tmp(num_t, tb, mn*p); mad_vec_fill(0, tb+m*p, (mn-m)*p); mad_vec_copy (b , tb, m*p); mad_mat_trans(tb, tb, mn, p); mad_mat_trans(a , ta, m , n); dgelsy_(&nm, &nn, &np, ta, &nm, tb, &mn, pvt, &rcond, &rank, &sz, &lwork, &info); // query mad_alloc_tmp(num_t, wk, lwork=sz); dgelsy_(&nm, &nn, &np, ta, &nm, tb, &mn, pvt, &rcond, &rank, wk, &lwork, &info); // compute mad_mat_trans(tb, tb, p, mn); mad_vec_copy (tb, x, n*p); mad_free_tmp(wk); mad_free_tmp(ta); mad_free_tmp(tb); if (info < 0) error("Solve: invalid input argument"); if (info > 0) warn ("Solve: unexpected lapack error"); return rank; } int mad_cmat_solve (const cpx_t a[], const cpx_t b[], cpx_t x[], ssz_t m, ssz_t n, ssz_t p, num_t rcond) { assert( a && b && x ); int info=0; const int nm=m, nn=n, np=p, mn=MAX(m,n); cpx_t sz; num_t rwk[2*nn]; int lwork=-1, rank; int pvt[nn]; memset(pvt, 0, sizeof pvt); mad_alloc_tmp(cpx_t, ta, m*n); mad_alloc_tmp(cpx_t, tb, mn*p); mad_cvec_fill(0, tb+m*p, (mn-m)*p); mad_cvec_copy (b , tb, m*p); mad_cmat_trans(tb, tb, mn, p); mad_cmat_trans(a , ta, m , n); zgelsy_(&nm, &nn, &np, ta, &nm, tb, &mn, pvt, &rcond, &rank, &sz, &lwork, rwk, &info); // query mad_alloc_tmp(cpx_t, wk, lwork=creal(sz)); zgelsy_(&nm, &nn, &np, ta, &nm, tb, &mn, pvt, &rcond, &rank, wk, &lwork, rwk, &info); // compute mad_cmat_trans(tb, tb, p, mn); mad_cvec_copy (tb, x, n*p); mad_free_tmp(wk); mad_free_tmp(ta); mad_free_tmp(tb); if (info < 0) error("Solve: invalid input argument"); if (info > 0) warn ("Solve: unexpected lapack error"); return rank; } int mad_mat_ssolve (const num_t a[], const num_t b[], num_t x[], ssz_t m, ssz_t n, ssz_t p, num_t rcond, num_t s_[]) { assert( a && b && x ); int info=0; const int nm=m, nn=n, np=p, mn=MAX(m,n); num_t sz; int lwork=-1, rank, isz; mad_alloc_tmp(num_t, ta, m *n); mad_alloc_tmp(num_t, tb, mn*p); mad_alloc_tmp(num_t, ts, MIN(m,n)); mad_vec_copy (b , tb, m*p); mad_vec_fill (0 , tb +m*p, (mn-m)*p); mad_mat_trans(tb, tb, mn, p); mad_mat_trans(a , ta, m , n); dgelsd_(&nm, &nn, &np, ta, &nm, tb, &mn, ts, &rcond, &rank, &sz, &lwork, &isz, &info); // query mad_alloc_tmp(num_t, wk, lwork=sz); mad_alloc_tmp(int , iwk, isz); dgelsd_(&nm, &nn, &np, ta, &nm, tb, &mn, ts, &rcond, &rank, wk, &lwork, iwk, &info); // compute mad_mat_trans(tb, tb, p, mn); mad_vec_copy (tb, x, n*p); if (s_) mad_vec_copy(ts, s_, MIN(m,n)); mad_free_tmp(wk); mad_free_tmp(iwk); mad_free_tmp(ta); mad_free_tmp(tb); mad_free_tmp(ts); if (info < 0) error("SSolve: invalid input argument"); if (info > 0) warn ("SSolve: failed to converge"); return rank; } int mad_cmat_ssolve (const cpx_t a[], const cpx_t b[], cpx_t x[], ssz_t m, ssz_t n, ssz_t p, num_t rcond, num_t s_[]) { assert( a && b && x ); int info=0; const int nm=m, nn=n, np=p, mn=MAX(m,n); num_t rsz; cpx_t sz; int lwork=-1, rank, isz; mad_alloc_tmp(cpx_t, ta, m*n); mad_alloc_tmp(cpx_t, tb, mn*p); mad_alloc_tmp(num_t, ts, MIN(m,n)); mad_cvec_copy (b , tb, m*p); mad_cvec_fill (0 , tb +m*p, (mn-m)*p); mad_cmat_trans(tb, tb, mn, p); mad_cmat_trans(a , ta, m , n); zgelsd_(&nm, &nn, &np, ta, &nm, tb, &mn, ts, &rcond, &rank, &sz, &lwork, &rsz, &isz, &info); // query mad_alloc_tmp(cpx_t, wk, lwork=creal(sz)); mad_alloc_tmp( num_t, rwk, (int)rsz); mad_alloc_tmp( int , iwk, isz); zgelsd_(&nm, &nn, &np, ta, &nm, tb, &mn, ts, &rcond, &rank, wk, &lwork, rwk, iwk, &info); // compute mad_cmat_trans(tb, tb, p, mn); mad_cvec_copy (tb, x, n*p); if (s_) mad_vec_copy(ts, s_, MIN(m,n)); mad_free_tmp(wk); mad_free_tmp(rwk); mad_free_tmp(iwk); mad_free_tmp(ta); mad_free_tmp(tb); mad_free_tmp(ts); if (info < 0) error("SSolve: invalid input argument"); if (info > 0) warn ("SSolve: failed to converge"); return rank; } // -- Generalized LS Solvers --------------------------------------------------o int mad_mat_gsolve (const num_t a[], const num_t b[], const num_t c[], const num_t d[], num_t x[], ssz_t m, ssz_t n, ssz_t p, num_t *nrm_) { assert( a && b && x ); ensure( 0 <= p && p <= n && n <= m+p, "invalid system sizes" ); int info=0; const int nm=m, nn=n, np=p; num_t sz; int lwork=-1; mad_alloc_tmp(num_t, ta, m*n); mad_alloc_tmp(num_t, tb, p*n); mad_alloc_tmp(num_t, tc, m); mad_alloc_tmp(num_t, td, p); mad_mat_trans(a, ta, m, n); mad_mat_trans(b, tb, p, n); mad_vec_copy (c, tc, m); mad_vec_copy (d, td, p); dgglse_(&nm, &nn, &np, ta, &nm, tb, &np, tc, td, x, &sz, &lwork, &info); // query mad_alloc_tmp(num_t, wk, lwork=sz); dgglse_(&nm, &nn, &np, ta, &nm, tb, &np, tc, td, x, wk, &lwork, &info); // compute if (nrm_) *nrm_ = mad_vec_norm(tc+(n-p), m-(n-p)); // residues mad_free_tmp(wk); mad_free_tmp(ta); mad_free_tmp(tb); mad_free_tmp(tc); mad_free_tmp(td); if (info < 0) error("GSolve: invalid input argument"); if (info > 0) warn ("GSolve: [B A] is singular, no solution found"); return info; } int mad_cmat_gsolve (const cpx_t a[], const cpx_t b[], const cpx_t c[], const cpx_t d[], cpx_t x[], ssz_t m, ssz_t n, ssz_t p, num_t *nrm_) { assert( a && b && x ); ensure( 0 <= p && p <= n && n <= m+p, "invalid system sizes" ); int info=0; const int nm=m, nn=n, np=p; cpx_t sz; int lwork=-1; mad_alloc_tmp(cpx_t, ta, m*n); mad_alloc_tmp(cpx_t, tb, p*n); mad_alloc_tmp(cpx_t, tc, m); mad_alloc_tmp(cpx_t, td, p); mad_cmat_trans(a, ta, m, n); mad_cmat_trans(b, tb, p, n); mad_cvec_copy (c, tc, m); mad_cvec_copy (d, td, p); zgglse_(&nm, &nn, &np, ta, &nm, tb, &np, tc, td, x, &sz, &lwork, &info); // query mad_alloc_tmp(cpx_t, wk, lwork=sz); zgglse_(&nm, &nn, &np, ta, &nm, tb, &np, tc, td, x, wk, &lwork, &info); // compute if (nrm_) *nrm_ = mad_cvec_norm(tc+(n-p), m-(n-p)); // residues mad_free_tmp(wk); mad_free_tmp(ta); mad_free_tmp(tb); mad_free_tmp(tc); mad_free_tmp(td); if (info < 0) error("GSolve: invalid input argument"); if (info > 0) warn ("GSolve: [B A] is singular, no solution found"); return info; } int mad_mat_gmsolve (const num_t a[], const num_t b[], const num_t d[], num_t x[], num_t y[], ssz_t m, ssz_t n, ssz_t p) { assert( a && b && x ); ensure( 0 <= p && n <= m && m <= n+p, "invalid system sizes" ); int info=0; const int nm=m, nn=n, np=p; num_t sz; int lwork=-1; mad_alloc_tmp(num_t, ta, m*n); mad_alloc_tmp(num_t, tb, m*p); mad_alloc_tmp(num_t, td, m); mad_mat_trans(a, ta, m, n); mad_mat_trans(b, tb, m, p); mad_vec_copy (d, td, m); dggglm_(&nm, &nn, &np, ta, &nm, tb, &nm, td, x, y, &sz, &lwork, &info); // query mad_alloc_tmp(num_t, wk, lwork=sz); dggglm_(&nm, &nn, &np, ta, &nm, tb, &nm, td, x, y, wk, &lwork, &info); // compute mad_free_tmp(wk); mad_free_tmp(ta); mad_free_tmp(tb); mad_free_tmp(td); if (info < 0) error("GMSolve: invalid input argument"); if (info > 0) warn ("GMSolve: [A B] is singular, no solution found"); return info; } int mad_cmat_gmsolve (const cpx_t a[], const cpx_t b[], const cpx_t d[], cpx_t x[], cpx_t y[], ssz_t m, ssz_t n, ssz_t p) { assert( a && b && x ); ensure( 0 <= p && n <= m && m <= n+p, "invalid system sizes" ); int info=0; const int nm=m, nn=n, np=p; cpx_t sz; int lwork=-1; mad_alloc_tmp(cpx_t, ta, m*n); mad_alloc_tmp(cpx_t, tb, m*p); mad_alloc_tmp(cpx_t, td, m); mad_cmat_trans(a, ta, m, n); mad_cmat_trans(b, tb, m, p); mad_cvec_copy (d, td, m); zggglm_(&nm, &nn, &np, ta, &nm, tb, &nm, td, x, y, &sz, &lwork, &info); // query mad_alloc_tmp(cpx_t, wk, lwork=sz); zggglm_(&nm, &nn, &np, ta, &nm, tb, &nm, td, x, y, wk, &lwork, &info); // compute mad_free_tmp(wk); mad_free_tmp(ta); mad_free_tmp(tb); mad_free_tmp(td); if (info < 0) error("GMSolve: invalid input argument"); if (info > 0) warn ("GMSolve: [A B] is singular, no solution found"); return info; } // -- EIGEN -------------------------------------------------------------------o // Eigen values and vectors // A:[n x n], U:[m x m], S:[min(m,n)], V:[n x n] int mad_mat_eigen (const num_t x[], cpx_t w[], num_t vl_[], num_t vr_[], ssz_t n) { assert( x && w ); int info=0; const int nn=n; const str_t vls = vl_ ? "V" : "N"; const str_t vrs = vr_ ? "V" : "N"; num_t sz; int lwork=-1; mad_alloc_tmp(num_t, wr, n); mad_alloc_tmp(num_t, wi, n); mad_alloc_tmp(num_t, ra, n*n); mad_mat_trans(x, ra, n, n); dgeev_(vls, vrs, &nn, ra, &nn, wr, wi, vl_, &nn, vr_, &nn, &sz, &lwork, &info); // query mad_alloc_tmp(num_t, wk, lwork=sz); dgeev_(vls, vrs, &nn, ra, &nn, wr, wi, vl_, &nn, vr_, &nn, wk, &lwork, &info); // compute mad_vec_cplx(wr, wi, w, n); mad_free_tmp(wk); mad_free_tmp(ra); mad_free_tmp(wi); mad_free_tmp(wr); //if (vl_) mad_mat_trans(vl_, vl_, n, n); if (vr_) mad_mat_trans(vr_, vr_, n, n); if (info < 0) error("Eigen: invalid input argument"); if (info > 0) warn ("Eigen: failed to compute all eigenvalues"); return info; } int mad_cmat_eigen (const cpx_t x[], cpx_t w[], cpx_t vl_[], cpx_t vr_[], ssz_t n) { assert( x && w ); int info=0; const int nn=n; const str_t vls = vl_ ? "V" : "N"; const str_t vrs = vr_ ? "V" : "N"; cpx_t sz; int lwork=-1; mad_alloc_tmp(num_t, rwk, 2*n); mad_alloc_tmp(cpx_t, ra, n*n); mad_cmat_trans(x, ra, n, n); zgeev_(vls, vrs, &nn, ra, &nn, w, vl_, &nn, vr_, &nn, &sz, &lwork, rwk, &info); // query mad_alloc_tmp(cpx_t, wk, lwork=creal(sz)); zgeev_(vls, vrs, &nn, ra, &nn, w, vl_, &nn, vr_, &nn, wk, &lwork, rwk, &info); // compute mad_free_tmp(wk); mad_free_tmp(ra); mad_free_tmp(rwk); //if (vl_) mad_cmat_trans(vl_, vl_, n, n); if (vr_) mad_cmat_trans(vr_, vr_, n, n); if (info < 0) error("Eigen: invalid input argument"); if (info > 0) warn ("Eigen: failed to compute all eigenvalues"); return info; } // -- GEOMETRY ----------------------------------------------------------------o // -- Helpers -----------------------------------------------------------------o #define NN (N*N) #define X(i,j) x[(i-1)*N+(j-1)] #define VCPY(src,dst) for(idx_t i=0; i -0.99999) { rr = sqrt(1+tt), ss = 0.5/rr; q[0] = 0.5*rr; if (!inv) { // normal q[1] = (X(3,2) - X(2,3)) * ss; q[2] = (X(1,3) - X(3,1)) * ss; q[3] = (X(2,1) - X(1,2)) * ss; } else { // transposed q[1] = (X(2,3) - X(3,2)) * ss; q[2] = (X(3,1) - X(1,3)) * ss; q[3] = (X(1,2) - X(2,1)) * ss; } return; } // look for more stable trace num_t m = MAX(xx, yy, zz); if (!inv) { // normal if (m == xx) { rr = sqrt(1+xx-yy-zz), ss = 0.5/rr; q[1] = 0.5*rr; q[0] = (X(3,2) - X(2,3)) * ss; q[2] = (X(1,3) + X(3,1)) * ss; q[3] = (X(2,1) + X(1,2)) * ss; } else if (m == yy) { rr = sqrt(1+yy-xx-zz), ss = 0.5/rr; q[2] = 0.5*rr; q[0] = (X(3,2) - X(2,3)) * ss; q[1] = (X(1,3) - X(3,1)) * ss; q[3] = (X(2,1) + X(1,2)) * ss; } else { rr = sqrt(1+zz-xx-yy), ss = 0.5/rr; q[3] = 0.5*rr; q[0] = (X(3,2) - X(2,3)) * ss; q[1] = (X(1,3) - X(3,1)) * ss; q[2] = (X(2,1) - X(1,2)) * ss; } } else { // transposed if (m == xx) { rr = sqrt(1+xx-yy-zz), ss = 0.5/rr; q[1] = 0.5*rr; q[0] = (X(2,3) - X(3,2)) * ss; q[2] = (X(3,1) + X(1,3)) * ss; q[3] = (X(1,2) + X(2,1)) * ss; } else if (m == yy) { rr = sqrt(1+yy-xx-zz), ss = 0.5/rr; q[2] = 0.5*rr; q[0] = (X(2,3) - X(3,2)) * ss; q[1] = (X(3,1) - X(1,3)) * ss; q[3] = (X(1,2) + X(2,1)) * ss; } else { rr = sqrt(1+zz-xx-yy), ss = 0.5/rr; q[3] = 0.5*rr; q[0] = (X(2,3) - X(3,2)) * ss; q[1] = (X(3,1) - X(1,3)) * ss; q[2] = (X(1,2) - X(2,1)) * ss; } } } #undef N #undef X // -- Orbit Correction --------------------------------------------------------o int mad_use_madx_micado = 0; int mad_use_madx_svdcnd = 0; extern void // see madx_micado.f90 micit_(num_t cin[], num_t res[], int nx[], num_t *rms, int *im, int *ic, int *iter, /* working buffers */ int ny[], num_t ax[], num_t cinx[], num_t xinx[], num_t resx[], num_t rho[], num_t ptop[], num_t rmss[], num_t xrms[], num_t xptp[], num_t xiter[], int *ifail); extern void svddec_(num_t svdmat[], num_t umat[], num_t vmat[], /* working buffers */ num_t ws[], num_t wvec[], int sortw[], num_t *sngcut, num_t *sngval, /* sizes and output */ int *im, int *ic, int *iflag, int sing[]); static void vec_sort (num_t v[], idx_t c[], ssz_t n) { num_t tr; idx_t ti; // Set indexes. FOR(i,n) c[i] = i; // Sort values by ascending order. FOR(i,1,n) RFOR(j,i+1) if (v[j-1] > v[j]) { SWAP(v[j-1], v[j], tr); SWAP(c[j-1], c[j], ti); } } static ssz_t ivec_sort (idx_t v[], ssz_t n, log_t rmdup) { idx_t ti; // Sort indexes by ascending order. FOR(i,1,n) RFOR(j,i+1) if (v[j-1] > v[j]) SWAP(v[j-1], v[j], ti); // Remove duplicates. if (rmdup) { idx_t k = 1; FOR(i,1,n) if (v[k-1] < v[i]) v[k++] = v[i]; return k; } return n; } static int // madx legacy code wrapper madx_svdcnd (const num_t a[], idx_t c[], ssz_t m, ssz_t n, num_t scut, num_t s_[], num_t sval) { /* copy buffers */ mad_alloc_tmp(num_t, A , m*n); mad_alloc_tmp(num_t, U , m*n); mad_alloc_tmp(num_t, V , n*n); mad_alloc_tmp(num_t, S , n ); /* working buffers */ mad_alloc_tmp(num_t, W , n ); mad_alloc_tmp(idx_t, srt, n ); mad_alloc_tmp(idx_t, sng, 2*n); mad_mat_trans(a, A, m, n); int im=m, ic=n, nc=0; svddec_(A, U, V, W, S, srt, &scut, &sval, &im, &ic, &nc, sng); // Backup singular values. if (s_) mad_vec_copy(S, s_, MIN(m,n)); // Backup indexes of columns to remove. FOR(i,nc) c[i] = sng[2*i]; /* copy buffers */ mad_free_tmp(A); mad_free_tmp(U); mad_free_tmp(V); mad_free_tmp(S); /* working buffers */ mad_free_tmp(W); mad_free_tmp(srt); mad_free_tmp(sng); // Return sorted indexes of columns to remove. return ivec_sort(c, nc, true); } int // Matrix preconditionning using SVD, return indexes of columns to remove. mad_mat_svdcnd(const num_t a[], idx_t c[], ssz_t m, ssz_t n, ssz_t N, num_t rcond, num_t s_[], num_t tol) { assert(a && c); // X-check with MAD-X SVD cond (where N = n-5) if (mad_use_madx_svdcnd) { return madx_svdcnd(a, c, m, n, rcond, s_, 1/tol); } ssz_t mn = MIN(m,n); mad_alloc_tmp(num_t, U, m*m); mad_alloc_tmp(num_t, V, n*n); mad_alloc_tmp(num_t, S, mn ); int info = mad_mat_svd(a, U, S, V, m, n); if (info != 0) return -1; // Backup singular values. if (s_) mad_vec_copy(S, s_, mn); // N == 0 means to check for all singular values. if (N > mn || N <= 0) N = mn; // Tolerance on components similarity in V columns. tol = MAX(tol, DBL_EPSILON); // Tolerance on keeping singular values. rcond = MAX(rcond, DBL_EPSILON); // Number of columns to remove. idx_t nc = 0; #define V(i,j) V[(i)*n+(j)] // Loop over increasing singular values. RFOR(i,mn,mn-N) { // Singular value is large, stop checking. if (S[i] > rcond*S[0]) break; // Loop over rows of V (i.e. columns of V^T) FOR(j,n-1) FOR(k,j+1,n) { num_t vj = fabs(V(j,i)); // Proceed only significant component for this singular value. if (vj > 1e-4) { num_t vk = fabs(V(k,i)); num_t rat = fabs(vj-vk)/(vj+vk); // Discard column j with similar (or opposite) effect of column k > j. if (rat <= tol) { c[nc++] = j; // can hold duplicated indexes... if (nc == n) goto finalize; // c is full... } } } } #undef V finalize: mad_free_tmp(U); mad_free_tmp(V); mad_free_tmp(S); // Return sorted indexes of columns to remove. return ivec_sort(c, nc, true); } int // Matrix preconditionning using SVD, return indexes of columns to remove. mad_cmat_svdcnd(const cpx_t a[], idx_t c[], ssz_t m, ssz_t n, ssz_t N, num_t rcond, num_t s_[], num_t tol) { assert(a && c); ssz_t mn = MIN(m,n); mad_alloc_tmp(cpx_t, U, m*m); mad_alloc_tmp(cpx_t, V, n*n); mad_alloc_tmp(num_t, S, mn ); int info = mad_cmat_svd(a, U, S, V, m, n); if (info != 0) return -1; // Backup singular values. if (s_) mad_vec_copy(S, s_, mn); // N == 0 means to check for all singular values. if (N > mn || N <= 0) N = mn; // Tolerance on components similarity in V columns. tol = MAX(tol, DBL_EPSILON); // Tolerance on keeping singular values. rcond = MAX(rcond, DBL_EPSILON); // Number of columns to remove. idx_t nc = 0; #define V(i,j) V[(i)*n+(j)] // Loop over increasing singular values. RFOR(i,mn,mn-N) { // Singular value is large, stop checking. if (S[i] > rcond*S[0]) break; // Loop over rows of V (i.e. columns of V^T) FOR(j,n-1) FOR(k,j+1,n) { num_t vj = cabs(V(j,i)); // Proceed only significant component for this singular value. if (vj > 1e-4) { num_t vk = cabs(V(k,i)); num_t rat = fabs(vj-vk)/(vj+vk); // Discard column j with similar (or opposite) effect of column k > j. if (rat <= tol) { c[nc++] = j; // can hold duplicated indexes... if (nc == n) goto finalize; // c is full... } } } } #undef V finalize: mad_free_tmp(U); mad_free_tmp(V); mad_free_tmp(S); // Return sorted indexes of columns to remove. return ivec_sort(c, nc, true); } int // Matrix reconditionning using SVD. mad_mat_pcacnd(const num_t a[], idx_t c[], ssz_t m, ssz_t n, ssz_t N, num_t rcond, num_t s_[]) { assert(a && c); ssz_t mn = MIN(m,n); mad_alloc_tmp(num_t, U, m*m); mad_alloc_tmp(num_t, V, n*n); mad_alloc_tmp(num_t, S, mn ); mad_alloc_tmp(num_t, P, n ); int info = mad_mat_svd(a, U, S, V, m, n); if (info != 0) return -1; // Backup singular values. if (s_) mad_vec_copy(S, s_, mn); // N <= 0 means keep all columns. if (N > n || N <= 0) N = n; // Tolerance on keeping singular values. rcond = MAX(rcond, DBL_EPSILON); FOR(i,N) if (S[i] <= rcond*S[0]) { N=i; break; } // Compute projections on Principal Components, i.e. S V. mad_vec_abs(V, V, N*n); mad_mat_mul(S, V, P, 1, n, N); // Sort projections by ascending order. vec_sort(P, c, n); mad_free_tmp(U); mad_free_tmp(V); mad_free_tmp(S); mad_free_tmp(P); // Return sorted indexes of columns to remove. return ivec_sort(c, n-N, false); } int // Matrix reconditionning using SVD. mad_cmat_pcacnd(const cpx_t a[], idx_t c[], ssz_t m, ssz_t n, ssz_t N, num_t rcond, num_t s_[]) { assert(a && c); ssz_t mn = MIN(m,n); mad_alloc_tmp(cpx_t, U, m*m); mad_alloc_tmp(cpx_t, V, n*n); mad_alloc_tmp(num_t, R, n*n); mad_alloc_tmp(num_t, S, mn ); mad_alloc_tmp(num_t, P, n ); int info = mad_cmat_svd(a, U, S, V, m, n); if (info != 0) return -1; // Backup singular values. if (s_) mad_vec_copy(S, s_, mn); // N <= 0 means keep all columns. if (N > n || N <= 0) N = n; // Tolerance on keeping singular values. rcond = MAX(rcond, DBL_EPSILON); FOR(i,N) if (S[i] <= rcond*S[0]) { N=i; break; } // Compute projections on Principal Components, i.e. S V. mad_cvec_abs(V, R, N*n); mad_mat_mul (S, R, P, 1, n, N); // Sort projections by ascending order. vec_sort(P, c, n); mad_free_tmp(U); mad_free_tmp(V); mad_free_tmp(R); mad_free_tmp(S); mad_free_tmp(P); // Return sorted indexes of columns to remove. return ivec_sort(c, n-N, false); } static int // madx legacy code wrapper madx_micado (const num_t a[], const num_t b[], num_t x[], ssz_t m, ssz_t n, ssz_t N, num_t tol, num_t r_[]) { /* copy buffers */ mad_alloc_tmp(num_t, X , n); mad_alloc_tmp(num_t, R , m); /* working buffers */ mad_alloc_tmp(idx_t, nx , n); mad_alloc_tmp(idx_t, ny , n); mad_alloc_tmp(num_t, ax , m*n); mad_alloc_tmp(num_t, cinx, n); mad_alloc_tmp(num_t, xinx, m); mad_alloc_tmp(num_t, resx, m); mad_alloc_tmp(num_t, rho , 3*n); mad_alloc_tmp(num_t, ptop, n); mad_alloc_tmp(num_t, rmss, n); mad_alloc_tmp(num_t, xrms, n); mad_alloc_tmp(num_t, xptp, n); mad_alloc_tmp(num_t, xitr, n); mad_mat_trans(a, ax , m, n); mad_vec_copy (b, xinx, m); mad_vec_fill (0, x , n); int im=m, ic=n, iter=N, ifail=0; num_t rms=tol; micit_(X, R, nx, &rms, &im, &ic, &iter, /* working buffers */ ny, ax, cinx, xinx, resx, rho, ptop, rmss, xrms, xptp, xitr, &ifail); // Re-order corrector strengths and save residues. Strengths are not minused! FOR(i,iter) x[i] = -X[nx[i]-1]; if (r_) mad_vec_copy(R, r_, m); /* copy buffers */ mad_free_tmp(X); mad_free_tmp(R); /* working buffers */ mad_free_tmp(nx); mad_free_tmp(ny); mad_free_tmp(ax); mad_free_tmp(cinx); mad_free_tmp(xinx); mad_free_tmp(resx); mad_free_tmp(rho); mad_free_tmp(ptop); mad_free_tmp(rmss); mad_free_tmp(xrms); mad_free_tmp(xptp); mad_free_tmp(xitr); return iter; } /* Reference: B. Autin, and Y. Marti, "Closed Orbit Correction of A.G. Machines Using A Small Number of Magnets", CERN ISR-MA/73-17, 1973. */ int // Micado (from MAD9 + bug fixes) mad_mat_nsolve(const num_t a[], const num_t b[], num_t x[], ssz_t m, ssz_t n, ssz_t N, num_t tol, num_t r_[]) { assert(a && b && x); // Micado notes: min_x || b - Ax ||_2 using N correctors amongst n // a: response matrix [m x n] // b: vector of monitors [m] // x: vector of correctors [n] (out) // r: residues [m] (out) // N: number of correctors to use 0 < N <= n (out: actually used) mad_vec_fill(0, x, n); // No correctors. if (n == 0) return 0; if (tol < DBL_EPSILON) tol = DBL_EPSILON; // Checks if tolerance is already reached. { num_t e = mad_vec_norm(b, m) / m; if (e <= tol) return 0; } // N == 0 means to use all correctors if (N > n || N <= 0) N = n; // X-check with MAD-X Micado if (mad_use_madx_micado) { return madx_micado(a, b, x, m, n, N, tol, r_); } mad_alloc_tmp(num_t, A , m*n); mad_alloc_tmp(num_t, B , m); mad_alloc_tmp(num_t, X , n); mad_alloc_tmp(num_t, R , m); mad_alloc_tmp(num_t, sqr, n); // rho[k] = dot[k]^2/sqr[k] mad_alloc_tmp(num_t, dot, n); mad_alloc_tmp(idx_t, pvt, n); mad_vec_copy(a, A, m*n); mad_vec_copy(b, B, m); mad_vec_fill(0, X, n); mad_vec_fill(0, R, m); #define A(i,j) A[(i)*n+(j)] // Box 3: Compute scalar products sqr[k] = A[k].A[k] and dot[k] = A[k].B. num_t sqrmin = 0; { num_t sum = 0; FOR(k,n) { pvt[k] = k; num_t hh = 0, gg = 0; FOR(i,m) { hh += A(i,k) * A(i,k); // corrector effectiveness versus measured orbit gg += A(i,k) * B[i]; // corrector effectiveness versus target orbit } sum += hh; sqr[k] = hh; dot[k] = gg; } sqrmin = 1e-8 * sum / n; // 1e-8 * average of correctors effectiveness } // Begin of iteration (l): loop over best-kick selection (i.e. A columns). FOR(k,N) { // Box 3: Search the columns not yet used for largest scaled change vector. { num_t maxChange = 0; idx_t changeIndex = -1; FOR(j,k,n) { if (sqr[j] > sqrmin) { // criteria rho that minimize the residues num_t change = dot[j]*dot[j] / sqr[j]; if (change > maxChange) { changeIndex = j; maxChange = change; } } } // Stop iterations if no suitable column are found. if (changeIndex < 0) { N=k; break; } // Move the column just found to next position. if (changeIndex > k) { num_t tr; idx_t ti; SWAP(sqr[k], sqr[changeIndex], tr); SWAP(dot[k], dot[changeIndex], tr); SWAP(pvt[k], pvt[changeIndex], ti); FOR(i,m) SWAP(A(i,k), A(i,changeIndex), tr); } } // Box 4: Compute beta, sigma, and vector u[k]. num_t beta, hh; { hh = 0; FOR(i,k,m) hh += A(i,k) * A(i,k); num_t sigma = A(k,k) < 0 ? -sqrt(hh) : sqrt(hh); sqr[k] = -sigma; // saved for use in X[1..k] update A(k,k) += sigma; beta = 1 / (A(k,k) * sigma); } // Box 5: Transform remaining columns of A. FOR(j,k+1,n) { hh = 0; FOR(i,k,m) hh += A(i,k) * A(i,j); hh *= beta; FOR(i,k,m) A(i,j) -= A(i,k) * hh; } // Box 6: Transform vector b. hh = 0; FOR(i,k,m) hh += A(i,k) * B[i]; hh *= beta; FOR(i,k,m) B[i] -= A(i,k) * hh; // Box 3: Update scalar products sqr[j]=A[j]*A[j] and dot[j]=A[j]*b. FOR(j,k+1,n) { sqr[j] -= A(k,j) * A(k,j); dot[j] -= A(k,j) * B[k]; } // Box 7: Recalculate solution vector x. Here, sqr[1..k] = -sigma[1..k]. RFOR(i,k+1) { X[i] = B[i]; FOR(j,i+1,k+1) X[i] -= A(i,j) * X[j]; X[i] /= sqr[i]; } // Box 8: Compute original residual vector by backward transformation. mad_vec_copy(B, R, m); RFOR(j,k+1) { R[j] = hh = 0; FOR(i,j,m) hh += A(i,j) * R[i]; hh /= sqr[j] * A(j,j); FOR(i,j,m) R[i] += A(i,j) * hh; } // Box 9: Check for convergence. num_t e = mad_vec_norm(R, m) / m; if (e <= tol) { N=k+1; break; } } #undef A // Re-order corrector strengths and save residues. FOR(i,N) x[pvt[i]] = X[i]; if (r_) mad_vec_copy(R, r_, m); mad_free_tmp(A); mad_free_tmp(B); mad_free_tmp(X); mad_free_tmp(R); mad_free_tmp(sqr); mad_free_tmp(dot); mad_free_tmp(pvt); return N; } // -- Survey Misalignments ----------------------------------------------------o #define N 3 // Fast computation of Rbar and Tbar for misalignments at element exit (forward) // Tbar = W^t (RV+T-V) // Rbar = W^t R W #include "mad_cst.h" void mad_mat_rtbar (num_t Rb[NN], num_t Tb[N], num_t el, num_t ang, num_t tlt, const num_t R_[NN], const num_t T [N]) { assert(Rb && Tb && T); if (fabs(ang) < mad_cst_MINANG) { // -- straight ------------------o if (R_) { num_t Ve[N] = {0, 0, el}; // We = I mad_mat_mul (R_, Ve, Tb, N, 1, N); mad_vec_sub (Tb, Ve, Tb, N); mad_vec_add (Tb, T , Tb, N); // Tb = R*Ve + T - Ve mad_vec_copy(R_, Rb, NN); // Rb = R } else { // R = I mad_vec_copy(T, Tb, N); // Tb = T mad_mat_eye (Rb, 1, N, N, N); // Rb = I } } else { // -- curved --------------------o num_t rho = el/ang; num_t Ve[N] = {rho*(cos(ang)-1), 0, rho*sin(ang)}; num_t We[NN]; mad_mat_roty(We, -ang); num_t Wt[NN]; // tilt if (fabs(tlt) >= mad_cst_MINANG) { num_t Rz[NN]; mad_mat_rotz(Rz, tlt); mad_mat_mul (Rz, Ve, Ve, N, 1, N); // Ve = Rz*Ve mad_mat_mul (Rz, We, Wt, N, N, N); mad_mat_mult(Wt, Rz, We, N, N, N); // We = Rz*We*Rz:t() } if (R_) { num_t Vt[N]; mad_mat_mul (R_, Ve, Vt, N, 1, N); mad_vec_sub (Vt, Ve, Vt, N); mad_vec_add (Vt, T , Vt, N); mad_mat_tmul(We, Vt, Tb, N, 1, N); // Tb = We:t()*(R*Ve + T - Ve) mad_mat_tmul(We, R_, Wt, N, N, N); mad_mat_mul (Wt, We, Rb, N, N, N); // Rb = We:t()*R*We } else { // R = I mad_mat_tmul(We, T , Tb, N, 1, N); // Tb = We:t()*T mad_mat_eye (Rb, 1, N, N, N); // Rb = I } } } #undef N