From 176364e30ebf545fbedee96070dad388d4144fb0 Mon Sep 17 00:00:00 2001
From: Claus Fieker <fieker@mathematik.uni-kl.de>
Date: Thu, 9 Nov 2023 16:47:06 +0100
Subject: [PATCH 1/5] seems to work...

  - adaptive ratrec for dixon
  - dixon in "julia"
  - solve_left for odd dim
---
 examples/strassen.jl | 710 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 700 insertions(+), 10 deletions(-)

diff --git a/examples/strassen.jl b/examples/strassen.jl
index c9ce8cd293..6cd844d8ed 100644
--- a/examples/strassen.jl
+++ b/examples/strassen.jl
@@ -18,7 +18,7 @@ module Strassen
 using Hecke
 import Hecke.AbstractAlgebra, Hecke.Nemo
 
-const cutoff = 64
+const cutoff = 1500
 
 function Nemo.mul!(C::AbstractArray, A::AbstractArray, B::AbstractArray, add::Bool = false)
   @assert size(C) == (2,2) && size(A) == (2,2) && size(B) == (2,2)
@@ -197,7 +197,33 @@ function mul_strassen!(A::Generic.Mat, B::Generic.Mat, C::Generic.Mat)
   mul_strassen!(C.entries, A.entries, B.entries)
 end
 
-function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}) where {T}
+function zero!(A::MatElem{T}) where {T}
+  error("not done yet")
+end
+
+function zero!(A::fpMatrix)
+  ccall((:nmod_mat_zero, Nemo.libflint), Cvoid, (Ref{fpMatrix}, ), A)
+end
+
+function zero!(A::ZZMatrix)
+  ccall((:fmpz_mat_zero, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, ), A)
+end
+
+function sub!(A::ZZMatrix, B::ZZMatrix, C::ZZMatrix)
+  ccall((:fmpz_mat_sub, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{ZZMatrix}), A, B, C)
+end
+
+function sub!(A::fpMatrix, B::fpMatrix, C::fpMatrix)
+  ccall((:fmpz_mat_sub, Nemo.libflint), Cvoid, (Ref{fpMatrix}, Ref{fpMatrix}, Ref{fpMatrix}), A, B, C)
+end
+
+function mul_strassen(A::MatElem{T}, B::MatElem{T}; cutoff::Int = cutoff) where {T}
+  C = zero_matrix(base_ring(A), nrows(A), ncols(B))
+  mul_strassen!(C, A, B; cutoff)
+  return C
+end
+
+function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}; cutoff::Int = cutoff) where {T}
   sA = size(A)
   sB = size(B)
   sC = size(C)
@@ -259,26 +285,26 @@ function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}) where {T}
   X1 = A11 - A21
   X2 = B22 - B12
   #nmod_mat_mul(C21, X1, X2);
-  mul_strassen!(C21, X1, X2)
+  mul_strassen!(C21, X1, X2; cutoff)
 
   add!(X1, A21, A22);
   sub!(X2, B12, B11);
   #nmod_mat_mul(C22, X1, X2);
-  mul_strassen!(C22, X1, X2)
+  mul_strassen!(C22, X1, X2; cutoff)
 
   sub!(X1, X1, A11);
   sub!(X2, B22, X2);
   #nmod_mat_mul(C12, X1, X2);
-  mul_strassen!(C12, X1, X2)
+  mul_strassen!(C12, X1, X2; cutoff)
 
   sub!(X1, A12, X1);
   #nmod_mat_mul(C11, X1, B22);
-  mul_strassen!(C11, X1, B22)
+  mul_strassen!(C11, X1, B22; cutoff)
 
   #X1->c = bnc;
   #nmod_mat_mul(X1, A11, B11);
-  X1 = similar(C11)
-  mul_strassen!(X1, A11, B11)
+  zero!(X1)
+  mul_strassen!(X1, A11, B11; cutoff)
 
   add!(C12, X1, C12);
   add!(C21, C12, C21);
@@ -287,12 +313,12 @@ function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}) where {T}
   add!(C12, C12, C11);
   sub!(X2, X2, B21);
   #nmod_mat_mul(C11, A22, X2);
-  mul_strassen!(C11, A22, X2)
+  mul_strassen!(C11, A22, X2; cutoff)
 
   sub!(C21, C21, C11);
 
   #nmod_mat_mul(C11, A12, B21);
-  mul_strassen!(C11, A12, B21)
+  mul_strassen!(C11, A12, B21; cutoff)
 
   add!(C11, X1, C11);
 
@@ -326,5 +352,669 @@ function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}) where {T}
   end
 end
 
+#
+# see https://arxiv.org/pdf/1901.00904.pdf
+# p2
+#
+#  A B  = I       0  * A 0 * I A^-1 B
+#  C D   C A^-1   I    0 X   0  I
+#
+# for X = D - C A^-1 B
+#
+# inv( I A; 0 I) = I -A; 0 I
+#
+# TODO:
+#    inplace
+#    pre-alloced tmp for the recursion
+#
+function inv_strassen(A)
+  if isodd(ncols(A)) || ncols(A) < 500
+    return inv(A)
+  end
+#  @show :strassen, ncols(A)
+  @assert nrows(A) == ncols(A)
+  @assert iseven(ncols(A))
+  n = div(ncols(A), 2)
+  A11 = view(A, 1:n, 1:n)
+  A12 = view(A, 1:n, n+1:2*n)
+  A21 = view(A, n+1:2*n, 1:n)
+  A22 = view(A, n+1:2*n, n+1:2*n)
+
+#  I = identity_matrix(base_ring(A), n)
+#  Z = zero_matrix(base_ring(A), n, n)
+
+
+  A11_i = inv_strassen(A11)
+  A11_i_A12 = A11_i*A12
+  delta = A22-A21*A11_i_A12
+  X22 = delta_i = inv_strassen(delta)
+  A21_A11_i = A21*A11_i
+  X21 = -delta_i*A21_A11_i
+  X12 = -A11_i_A12*delta_i
+  X11 = A11_i-A11_i_A12*X21
+
+  X = similar(A)
+  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, 1:n, 1:n), X11)
+  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, 1:n, n+1:n+n), X12)
+  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, n+1:n+n, 1:n), X21)
+  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, n+1:n+n, n+1:n+n), X22)
+
+  return X
+
+  return [X11 X12; X21 X22]
+end
+
+function solve_triu_left_strassen(b::MatElem, T::MatElem; cutoff::Int = cutoff)
+  #b*inv(T)
+  n = ncols(T)
+  if n <= 50
+    R = solve_triu_left(b, T)
+    return R
+  end
+
+  n2 = div(n, 2) + n % 2
+  m = nrows(b)
+  m2 = div(m, 2) + m % 2
+
+  U = view(b, 1:m2, 1:n2)
+  V = view(b, 1:m2, n2+1:n)
+  X = view(b, m2+1:m, 1:n2)
+  Y = view(b, m2+1:m, n2+1:n)
+
+  A = view(T, 1:n2, 1:n2)
+  B = view(T, 1:n2, 1+n2:n)
+  C = view(T, 1+n2:n, 1+n2:n)
+
+  S = solve_triu_left_strassen(U, A; cutoff)
+  R = solve_triu_left_strassen(X, A; cutoff)
+
+  SS = mul_strassen(S, B; cutoff)
+  sub!(SS, V, SS)
+  SS = solve_triu_left_strassen(SS, C; cutoff)
+
+  RR = mul_strassen(R, B; cutoff)
+  sub!(RR, Y, RR)
+  RR = solve_triu_left_strassen(RR, C; cutoff)
+
+  return [S SS; R RR]
+end
+
+function solve_triu_left(b::ZZMatrix, U::ZZMatrix)
+   n = ncols(U)
+   m = nrows(b)
+   R = base_ring(U)
+   X = zero(b)
+   tmp = zero_matrix(ZZ, 1, n)
+   t = R()
+   s = R()
+   for i = 1:m
+      tmp_p = Nemo.mat_entry_ptr(tmp, 1, 1)
+      X_p = Nemo.mat_entry_ptr(X, i, 1)
+      for j = 1:n
+         ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ptr{fmpz}, Ptr{fmpz}), tmp_p, X_p)
+         X_p += sizeof(ZZRingElem)
+         tmp_p += sizeof(ZZRingElem)
+      end
+      for j = 1:n
+         ccall((:fmpz_zero, Nemo.libflint), Cvoid, (Ref{fmpz}, ), s) 
+
+         tmp_p = Nemo.mat_entry_ptr(tmp, 1, 1)
+         for k = 1:j-1
+            U_p = Nemo.mat_entry_ptr(U, k, j)
+            ccall((:fmpz_addmul, Nemo.libflint), Cvoid, (Ref{fmpz}, Ptr{fmpz}, Ptr{fmpz}), s, U_p, tmp_p)
+            tmp_p += sizeof(ZZRingElem)
+         end
+         ccall((:fmpz_sub, Nemo.libflint), Cvoid, 
+          (Ref{fmpz}, Ptr{fmpz}, Ref{fmpz}), s, Nemo.mat_entry_ptr(b, i, j), s)
+         ccall((:fmpz_divexact, Nemo.libflint), Cvoid, 
+          (Ptr{fmpz}, Ref{fmpz}, Ptr{fmpz}), Nemo.mat_entry_ptr(tmp, 1, j), s, Nemo.mat_entry_ptr(U, j, j))
+      end
+      tmp_p = Nemo.mat_entry_ptr(tmp, 1, 1)
+      X_p = Nemo.mat_entry_ptr(X, i, 1)
+      for j = 1:n
+         ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ptr{fmpz}, Ptr{fmpz}), X_p, tmp_p)
+         X_p += sizeof(ZZRingElem)
+         tmp_p += sizeof(ZZRingElem)
+      end
+   end
+   return X
+end
+
+function Hecke.hadamard_bound2(M::ZZMatrix)
+  is_square(M) || error("Matrix must be square")
+  H = ZZ(1);
+  r = ZZ(0)
+  n = nrows(M)
+  for i in 1:n
+    ccall((:fmpz_set_si, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Int), r, 0)
+    M_ptr = Nemo.mat_entry_ptr(M, i, 1)
+    for j in 1:n
+      ccall((:fmpz_addmul, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Ptr{ZZRingElem}, Ptr{ZZRingElem}), r, M_ptr, M_ptr)
+      M_ptr += sizeof(ZZRingElem)
+    end
+    if iszero(r)
+      return r
+    end
+    mul!(H, H, r)
+  end
+  return H
+end
+
+function Base.maximum(::typeof(nbits), M::ZZMatrix)
+  mx = 0
+  n = nrows(M)
+  m = ncols(M)
+  M_ptr = Nemo.mat_entry_ptr(M, 1, 1)
+  for i in 1:n
+    for j in 1:m
+      if !iszero(unsafe_load(reinterpret(Ptr{Int}, M_ptr)))
+        mx = max(mx, ccall((:fmpz_bits, Nemo.libflint), Int, (Ptr{fmpz},), M_ptr))
+      end
+      M_ptr += sizeof(ZZRingElem)
+    end
+  end
+  return mx
+end
+
+
+# Triangular denominator: for a solution matrix "s", given denominator d and A= d*s.
+#  Uses presumably the following function
+## [17] gcdx(a::ZZRingElem, b::ZZRingElem)
+##     @ Nemo ~/OSCAR/GIT/Nemo.jl/src/flint/fmpz.jl:1293
+# NOTE: mod(,) seems to return least-non-negative remainder
+function hcol(A::ZZMatrix, d::ZZRingElem)  # why not Vector{ZZRingElem}???
+  n = nrows(A)
+  w = deepcopy(A)
+  g = d
+  t = Array(ZZ,n)
+  h = zero_matrix(ZZ,n,n)
+  s = 1
+  v = 1
+  for i = 1:n
+    k = n-(i-1)
+    gg,e,f = gcdx(g,A[k,1])
+    t[k] = f
+    h[k,k] = g/gg
+    g = gg
+  end
+  for i = 1:n
+    if h[i,i] == 1
+      continue
+    end
+    for k = 1:i-1
+      h[k,i] = mod(-t[i]*w[k,1], h[i,i])
+      w[k,1] = mod(w[k,1]+h[k,i]*w[i,1], d)   
+    end
+    w[i,1] = h[i,i]*w[i,1]
+    d = d/h[i,i]
+    divexact!(w, w, h[i,i])
+  end
+  return h
+end
+
+function infinity_norm(A::ZZMatrix)
+  return maximum(abs, A)
+end
+
+@inline function Nemo.mat_entry_ptr(b::fpMatrix, i::Int, j::Int)
+  unsafe_load(reinterpret(Ptr{Ptr{Clong}}, b.rows), i) + (j-1)*sizeof(Clong)
+end
+
+function induce_crt!(A::ZZMatrix, p::ZZRingElem, B::fpMatrix, ::Integer; signed::Bool = false)
+  #the second modulus is implicit in B: B.n
+
+  ccall((:fmpz_mat_CRT_ui, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{ZZRingElem}, Ref{fpMatrix}, Int), A, A, p, B, signed)
+  mul!(p, p, B.n)
+  return
+end
+
+# Storjohann's unimodular certification
+# We use CRT to obtain the modular inverse B0
+# The p-adic Hensel-lifting version (below) seems to be faster
+function UniCertZ_CRT(A::ZZMatrix)
+  n = nrows(A);
+  #assume ncols == n
+  H = hadamard_bound2(A)
+  EntrySize = infinity_norm(A)
+  e = max(16, Int(2+ceil(2*log2(n)+log2(EntrySize))));
+  println("Modulus has  $e bits");
+  
+  B0 = MatrixSpace(ZZ,n,n)(0); ## will be computed by crt in loop below
+  m = ZZ(1); p = 2^29;  # MAGIC NUMBER (initial prime, probably should be about 2^29?)
+  @time while (nbits(m) < e)
+    p = next_prime(p);
+    ZZmodP = GF(p, cached = false);
+    MatModP = map_entries(ZZmodP, A)
+    B00 = inv_strassen(MatModP)
+    induce_crt!(B0,m, B00,p, signed = true);  #  also updates:  m = m*p;
+  end
+  println("Got modular inverse");
+
+  # Compute max num iters in k
+  k = (n-1)*(log2(EntrySize)+log2(n)/2) - (2*log2(n) + log2(EntrySize));
+  k = k/log2(m);
+  k = 2+Int(ceil(log2(k)));
+  println("Max num iters is k=$(k)");
+
+  ZZmodM = residue_ring(ZZ,m);
+  B0_modm = map_entries(ZZmodM, B0);
+  I = MatrixSpace(ZZ,n,n)(1)
+  R = (I-A*B0)/m
+  if is_zero(R)
+    return true;
+  end
+  for i in 0:k-1
+    println("UniCertZ: loop: i=$(i)");
+    @time R_bar = R^2;
+    @time M = lift(B0_modm*map_entries(ZZmodM, R_bar));
+    Hecke.mod_sym!(M, m)
+    @time R = (R_bar - A*M)/m;
+    if is_zero(R)
+      return true
+    end
+  end #for
+  return false
+end
+
+function _det(a::fpMatrix)
+  a.r < 10 && return det(a)
+  #_det avoids a copy: det is computed destructively
+  r = ccall((:_nmod_mat_det, Nemo.libflint), UInt, (Ref{fpMatrix}, ), a)
+  return ZZ(r)
+end
+
+
+# Determinant algorithm: U is the range for random RHS matrix (default -100:100)
+# Answer is !!! CORRECT UP TO SIGN !!!
+function DetS(A::ZZMatrix, U::AbstractArray= -100:100)
+  n = ncols(A)
+  T = ZZMatrix[]
+  AT = A
+  d1 = ZZ(1) # will be the det
+
+  p = UInt(next_prime(2^60))
+  det_p = ZZ(0)
+  prod_p = ZZ(1)
+  Ainf = maximum(nbits, A)
+  Had = div(nbits(Hecke.hadamard_bound2(A)), 2)+1
+
+  #Dixon cost
+  # lifting has (at least) Had/60 many steps (as the numerator will be
+  # of maximal size)
+  # Each step uses (Ainf+60+n)/60 many modular products
+  #                +1 for lifting
+  #so that should be the number of quadratic steps
+  #
+  function dixon_cost(::fmpz)
+    ceil(Int, (((Ainf+60+n)/60 + 1) * Had/60)/n)
+  end
+  #this should estimate the number of modular dets that will cost
+  #as much as the next dixon step.
+  #if we are close enough to Had, we move to crt.
+  function crt_cost(d::fmpz)
+    return ceil(Int, (Had-nbits(d))/60)
+  end
+  function hnf_cost(d::fmpz)
+    ceil(Int, nbits(d)/60)
+  end
+  function uni_cost(d::fmpz)
+    ceil(Int, log(n)*(2*n+Ainf)/60)
+  end
+
+  f = true
+  local Ap::fpMatrix
+  @time while nbits(prod_p) < Ainf + n #+ 100
+    if f
+      f = false
+      Ap = map_entries(Nemo.fpField(p, false), A)
+    else
+      change_prime(Ap, p)
+      map_entries!(Ap, Nemo.fpField(p, false), A)
+    end
+    d = _det(Ap)
+    if det_p == 0
+      det_p = d
+    else
+      det_p = crt(det_p, prod_p, d, ZZ(p))
+    end
+    mul!(prod_p, prod_p, p)
+    p = next_prime(p)
+  end
+  det_p = mod_sym(det_p, prod_p)
+  det_small = false
+  if nbits(prod_p) - nbits(det_p) > 30
+    @show "det small", det_p
+    det_small = true
+  end
+
+  local D::DixonCtx
+  f = true
+  while !det_small
+    b = rand(matrix_space(ZZ,n,1),U); 
+    if f
+      D = dixon_init(A, b)
+      f = false
+    end
+    @time TS, d = dixon_solve(D, b)
+    for i=1:length(T)
+      TS = T[i]*TS
+    end
+    c = content(TS)
+    g = gcd(c, d)
+    divexact!(TS, TS, g)
+    d = divexact(d, g)
+    println("DetS: loop: nbits(d)=$(nbits(d))");
+    global last_bad = b, T, d
+    @assert !isone(d)  #should never happen, the det_p should catch that
+    isone(d) && break
+    d1 *= d
+    @show  (Had - nbits(d1) - nbits(prod_p))/60
+    if Had - nbits(d1) < nbits(prod_p)
+      @show "at H-bound",  (Had - nbits(d1) - nbits(prod_p))/60
+      return d1
+    end
+    if (Had - nbits(d1) - nbits(prod_p))/60 < dixon_cost(d)
+      @show "finishing with CRT"
+      while nbits(prod_p)+nbits(d1) < Had
+        change_prime(Ap, p)
+        map_entries!(Ap, Nemo.fpField(p, false), A)
+        d = _det(Ap)
+        det_p = crt(det_p, prod_p, d, ZZ(p))
+        mul!(prod_p, prod_p, ZZ(p))
+      end
+      mis = mod_sym(det_p*invmod(d1, prod_p), prod_p)
+      return d1*mis
+    end
+    @time T1 = hcol(TS, d)  
+    push!(T, T1)
+    @show :solve
+    @time AT = solve_triu_left_strassen(AT, T1)
+    @show nbits(prod_p), nbits(d1)
+    @show nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p)))
+    if abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p)) < ZZ(2)^100
+      break
+    end
+    @show nbits(d), Ainf
+    if nbits(d) < Ainf + n
+      h = hnf_modular_eldiv(AT, d)
+      d1 *= prod([h[i,i] for i=1:n])
+    @show Had / nbits(d1)  
+      AT = solve_triu_left_strassen(AT, h)
+      if abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p)) < ZZ(2)^100
+        break
+      end
+    end
+  end
+  det_p = invmod(d1, prod_p)*det_p
+  @show det_p = mod_sym(det_p, prod_p)
+  @assert abs(det_p) < ZZ(2)^100
+  h = hnf_modular_eldiv(AT, det_p)
+  @show det(h) // det_p, det(h)
+  d1 *= det(h)
+  AT = solve_triu_left_strassen(AT, h)
+    println("DOING UNICERTZ");
+    @show uni_cost(d1)
+    @show crt_cost(d1)
+    #need to cost this as well
+    @time fl = UniCertZ_CRT(AT)
+    #           @assert fl == is_unit(det(ZAT));
+    if fl
+      return d1
+    end
+    return d1
+    error("unimod failed")
+end
+
+#adaptive rational_reconstruction: if solution is unbalanced with
+#denominator smaller than numerator
+function ratrec(a::ZZMatrix, b::ZZRingElem)
+  A = similar(a)
+  D = ZZ(1)
+  T = ZZ()
+  n = ZZ()
+  d = ZZ()
+  for i=1:nrows(a)
+    a_ptr = Nemo.mat_entry_ptr(a, i, 1)
+    A_ptr = Nemo.mat_entry_ptr(A, i, 1)
+    for j=1:ncols(a)
+#      @show i, j
+      ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ref{fmpz}, Ptr{fmpz}), T, a_ptr)
+      mul!(T, T, D)
+      mod!(T, T, b)
+      fl = ratrec!(n, d, T, b)
+      fl || return fl, A, D
+      if !isone(d)
+        D = D*d
+        mul!(A, A, d)
+      end
+      ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ptr{fmpz}, Ref{fmpz}), A_ptr, n)
+
+      a_ptr += sizeof(ZZRingElem)
+      A_ptr += sizeof(ZZRingElem)
+    end
+  end
+  return true, A, D
+end
+
+function Nemo.div!(a::ZZRingElem, b::ZZRingElem, c::ZZRingElem)
+  ccall((:fmpz_tdiv_q, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Ref{ZZRingElem}, Ref{ZZRingElem}), a, b, c)
+end
+
+function shift_right!(a::ZZRingElem, b::ZZRingElem, i::Int)
+  ccall((:fmpz_fdiv_q_2exp, Nemo.libflint), Cvoid, (Ref{fmpz}, Ref{fmpz}, Int), a, b, i)
+end
+
+function ratrec(a::ZZRingElem, b::ZZRingElem)
+  n = ZZ()
+  d = ZZ()
+  fl = ratrec!(n, d, a, b)
+  return fl, n, d
+end
+
+function ratrec!(n::ZZRingElem, d::ZZRingElem, a::ZZRingElem, b::ZZRingElem)
+  k = nbits(b)
+  l = 1
+  N = deepcopy(b)
+  D = ZZ(2)
+#  @show :start
+  @assert 0<a<b
+  done = false
+  while !done && D <= N
+    mul!(D, D, D)
+    div!(N, b, D)
+    shift_right!(N, N, 1)
+    if D>N
+      D = N = iroot(b, 2)
+      D = div(D, 2)
+      done = true
+    end
+#    @show N, D, b, 2*N*D
+    @assert 2*N*D < b
+#    @show a<N, nbits(a), nbits(N)
+    fl = ccall((:_fmpq_reconstruct_fmpz_2, Nemo.libflint), Bool, (Ref{fmpz}, Ref{fmpz}, Ref{fmpz}, Ref{fmpz}, Ref{fmpz}, Ref{fmpz}), n, d, a, b, N, D)
+#    @show fl, nbits(D), nbits(d), k - nbits(n)-nbits(d)
+#    @show n, d
+    if fl && (nbits(n)+nbits(d) < k - 30 || D>N)
+      return fl
+    end
+    l += 1
+  end
+  return false
+end
+
+function Hecke.map_entries(k::Nemo.fpField, A::ZZMatrix)
+  a = zero_matrix(k, nrows(A), ncols(A))
+  return map_entries!(a, k, A)
+end
+
+function map_entries!(a::fpMatrix, k::Nemo.fpField, A::ZZMatrix)
+  ccall((:fmpz_mat_get_nmod_mat, Nemo.libflint), Cvoid, (Ref{fpMatrix}, Ref{ZZMatrix}), a, A)
+  return a
+end
+
+function change_prime(a::fpMatrix, p::UInt)
+  a.n = p  #in flint 3.0 available as nmod_mat_set_mod
+  a.norm = leading_zeros(p)
+  a.ninv = ccall((:n_preinvert_limb_prenorm, Nemo.libflint), UInt, (UInt, ), p << a.norm)
+end
+
+function lift!(A::ZZMatrix, a::fpMatrix)
+  ccall((:fmpz_mat_set_nmod_mat, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{fpMatrix}), A, a)
+end
+
+mutable struct DixonCtx
+  bound::ZZRingElem
+  Ainv::fpMatrix
+  p::UInt
+  crt_primes::Vector{UInt}
+  A_mod::Vector{fpMatrix}
+  d_mod::fpMatrix
+  y_mod::fpMatrix
+  Ay_mod::fpMatrix
+  Ay::ZZMatrix
+  x::ZZMatrix
+  d::ZZMatrix
+  A::ZZMatrix
+  function DixonCtx()
+    return new()
+  end
+end
+#copied from flint to allow the use of adaptive reconstruction,
+function dixon_init(A::ZZMatrix, B::ZZMatrix)
+  D = DixonCtx()
+  D.A = A
+
+  n = nrows(A)
+  cols = ncols(B)
+
+  _N = ZZ()
+  _D = ZZ()
+  ccall((:fmpz_mat_solve_bound, Nemo.libflint), Cvoid, (Ref{fmpz}, Ref{fmpz}, Ref{ZZMatrix}, Ref{ZZMatrix}), _N, _D, A, B)
+  Ainv = zero_matrix(GF(2, cached = false), n, n)
+  p = ccall((:fmpz_mat_find_good_prime_and_invert, Nemo.libflint), UInt, (Ref{fpMatrix}, Ref{ZZMatrix}, Ref{fmpz}), Ainv, A, _D)
+  @assert p != 0
+  D.p = p
+  D.Ainv = Ainv
+
+  D.bound = 2*maximum(abs, [_D, _N])^2 * 2^30
+  D.crt_primes = UInt[]
+  D.A_mod = fpMatrix[]
+ 
+  pr = ZZ(1)
+  xp = p
+  maxA = maximum(abs, A) *(p-1)*n*2
+
+  while true
+    push!(D.crt_primes, xp)
+    push!(D.A_mod, map_entries(Nemo.fpField(xp, false), A))
+    pr *= xp
+    if pr > maxA
+      break
+    end
+    xp = next_prime(xp)
+  end
+
+  k = Nemo.fpField(p, false)
+  D.d_mod = zero_matrix(k, nrows(B), ncols(B))
+  D.y_mod = zero_matrix(k, nrows(B), ncols(B))
+  D.Ay_mod = zero_matrix(k, nrows(B), ncols(B))
+  D.Ay = zero_matrix(ZZ, nrows(B), ncols(B))
+  D.x = zero_matrix(ZZ, nrows(B), ncols(B))
+
+  return D
+end
+
+function dixon_solve(D::DixonCtx, B::ZZMatrix)
+  zero!(D.x)
+  d = deepcopy(B)
+  ppow = ZZ(1)
+  i = 1
+  nexti = 1
+  while ppow <= D.bound
+    map_entries!(D.d_mod, Nemo.fpField(D.p, false), d)
+    mul!(D.y_mod, D.Ainv, D.d_mod)
+
+    ccall((:fmpz_mat_scalar_addmul_nmod_mat_fmpz, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{fpMatrix}, Ref{fmpz}), D.x, D.y_mod, ppow)
+
+    mul!(ppow, ppow, D.p)
+    if ppow > D.bound
+      break
+    end
+
+    stabilised = i == nexti;
+
+    if stabilised
+      nexti = ceil(Int,(i*1.4)) + 1;
+      fl, num, den = ratrec(D.x, ppow)
+
+      if fl
+        # fl = (D.A*num == den*B)
+        sz = max(maximum(nbits, D.A) + maximum(nbits, num) + nbits(ncols(B)) + 1, 
+                 maximum(nbits, B) + nbits(den))
+        if sz < nbits(ppow)
+          @show "no prod neccessary"
+        else
+          xp = next_prime(D.p)
+          @show ceil(Int, (sz - nbits(ppow))/60)
+          for i=1:ceil(Int, (sz - nbits(ppow))/60)
+            k = Nemo.fpField(xp, false)
+            Ap = map_entries(k, D.A)
+            Bp = map_entries(k, B)
+            nump = map_entries(k, num)
+            fl = Ap*nump == Bp*k(den)
+            fl || break
+            xp = next_prime(xp)
+          end
+        end
+        @show fl
+        fl && return num, den
+      end
+    end
+  
+    i += 1
+#    i % 100 == 0 && @show i
+#    i % 1000 == 0 && return
+
+    prod = ZZ(1)
+    if false
+      mul!(D.Ay, D.A, lift(D.y_mod))
+    else
+      for j=1:length(D.crt_primes)
+        change_prime(D.y_mod, D.crt_primes[j])
+        change_prime(D.Ay_mod, D.crt_primes[j])
+
+        mul!(D.Ay_mod, D.A_mod[j], D.y_mod)
+        if j == 1
+          lift!(D.Ay, D.Ay_mod)
+          prod = ZZ(D.crt_primes[j])
+        else
+          induce_crt!(D.Ay, prod, D.Ay_mod, D.crt_primes[j], signed = true)
+#          ccall((:fmpz_mat_CRT_ui, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{fmpz}, Ref{fpMatrix}, Int), Ay, Ay, prod, Ay_mod, 1)
+#          prod *= crt_primes[j]
+        end
+      end
+      change_prime(D.y_mod, D.p)
+    end
+    sub!(d, d, D.Ay)
+    divexact!(d, d, ZZ(D.p))
+  end
+  fl, num, den = ratrec(D.x, ppow)
+
+  @assert fl
+#  @time @assert D.A*num == den*B
+  return num, den
+end
+
+
+#= example
+
+ A=rand(MatrixSpace(ZZ,10,10),1:10);
+ DetS(A, -10:10)
+
+ A=rand(MatrixSpace(ZZ,300,300),10:100);
+ @time DetS(A)
+
+=#
+
 
 end # module

From 4a8a28da435a41b596bb8dd383a60984769a66d9 Mon Sep 17 00:00:00 2001
From: Claus Fieker <fieker@mathematik.uni-kl.de>
Date: Mon, 27 Nov 2023 10:20:44 +0100
Subject: [PATCH 2/5] progress...

---
 examples/strassen.jl | 492 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 435 insertions(+), 57 deletions(-)

diff --git a/examples/strassen.jl b/examples/strassen.jl
index 6cd844d8ed..c065ddf410 100644
--- a/examples/strassen.jl
+++ b/examples/strassen.jl
@@ -16,10 +16,53 @@
 
 module Strassen
 using Hecke
+using LinearAlgebra, Profile, Base.Intrinsics
 import Hecke.AbstractAlgebra, Hecke.Nemo
+import Hecke.Nemo: add!, mul!, zero!, sub!, solve_triu!, solve_tril!
 
 const cutoff = 1500
 
+#creates an unimodular n x n matrix where the inverse is much larger
+#than the original matrix. Entries are chosen in U
+function rand_mat(n::Int, U::AbstractArray=1:10)
+  C = zero_matrix(ZZ, n, n)
+  for i=1:n-1
+    C[i, i+1] = 1
+  end
+  for i=2:n
+    C[n, i] = rand(U)
+  end
+  C[n,1] = 1
+  for i=2:n-1
+    C[i,i] = rand(U)
+  end
+  return C
+end
+
+function perturb!(A, n::Int = 100, c::AbstractRange = -10:10)
+  for i=1:n 
+    x = rand(c)
+    while iszero(x)
+      x = rand(c)
+    end
+    if rand(0:1) == 1
+      k = rand(1:nrows(A))
+      l = rand(1:nrows(A))
+      while l == k
+        l = rand(1:nrows(A))
+      end
+      A[k, :] += x*A[l, :]
+    else
+      k = rand(1:ncols(A))
+      l = rand(1:ncols(A))
+      while l == k
+        l = rand(1:ncols(A))
+      end
+      A[:, k] += x*A[:, l]
+    end
+  end
+end
+
 function Nemo.mul!(C::AbstractArray, A::AbstractArray, B::AbstractArray, add::Bool = false)
   @assert size(C) == (2,2) && size(A) == (2,2) && size(B) == (2,2)
   C[1,1] = A[1,1] * B[1,1] + A[1,2] * B[2,1]
@@ -45,7 +88,7 @@ function mul_strassen!(C::AbstractArray, A::AbstractArray, B::AbstractArray)
   @assert a == sC[1] && b == sB[1] && c == sC[2]
 
   if (a <= cutoff || b <= cutoff || c <= cutoff)
-      mul!(C, A, B)
+      Nemo.mul!(C, A, B)
       return
   end
 
@@ -201,21 +244,21 @@ function zero!(A::MatElem{T}) where {T}
   error("not done yet")
 end
 
-function zero!(A::fpMatrix)
-  ccall((:nmod_mat_zero, Nemo.libflint), Cvoid, (Ref{fpMatrix}, ), A)
-end
-
-function zero!(A::ZZMatrix)
-  ccall((:fmpz_mat_zero, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, ), A)
-end
+#function zero!(A::fpMatrix)
+#  ccall((:nmod_mat_zero, Nemo.libflint), Cvoid, (Ref{fpMatrix}, ), A)
+#end
 
-function sub!(A::ZZMatrix, B::ZZMatrix, C::ZZMatrix)
-  ccall((:fmpz_mat_sub, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{ZZMatrix}), A, B, C)
-end
+#function zero!(A::ZZMatrix)
+#  ccall((:fmpz_mat_zero, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, ), A)
+#end
+#
+#function sub!(A::ZZMatrix, B::ZZMatrix, C::ZZMatrix)
+#  ccall((:fmpz_mat_sub, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{ZZMatrix}), A, B, C)
+#end
 
-function sub!(A::fpMatrix, B::fpMatrix, C::fpMatrix)
-  ccall((:fmpz_mat_sub, Nemo.libflint), Cvoid, (Ref{fpMatrix}, Ref{fpMatrix}, Ref{fpMatrix}), A, B, C)
-end
+#function sub!(A::fpMatrix, B::fpMatrix, C::fpMatrix)
+#  ccall((:fmpz_mat_sub, Nemo.libflint), Cvoid, (Ref{fpMatrix}, Ref{fpMatrix}, Ref{fpMatrix}), A, B, C)
+#end
 
 function mul_strassen(A::MatElem{T}, B::MatElem{T}; cutoff::Int = cutoff) where {T}
   C = zero_matrix(base_ring(A), nrows(A), ncols(B))
@@ -234,7 +277,7 @@ function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}; cutoff::Int
   @assert a == sC[1] && b == sB[1] && c == sC[2]
 
   if (a <= cutoff || b <= cutoff || c <= cutoff)
-      mul!(C, A, B)
+      Nemo.mul!(C, A, B)
       return
   end
 
@@ -328,7 +371,7 @@ function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}; cutoff::Int
       #nmod_mat_window_init(Cc, C, 0, 2*bnc, a, c);
       Cc = view(C, 1:a, 2*bnc+1:c)
       #nmod_mat_mul(Cc, A, Bc);
-      mul!(Cc, A, Bc)
+      Nemo.mul!(Cc, A, Bc)
   end
 
   if a > 2*anr #last row of A by B -> last row of C
@@ -337,7 +380,7 @@ function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}; cutoff::Int
       #nmod_mat_window_init(Cr, C, 2*anr, 0, a, c);
       Cr = view(C, 2*anr+1:a, 1:c)
       #nmod_mat_mul(Cr, Ar, B);
-      mul!(Cr, Ar, B)
+     Nemo.mul!(Cr, Ar, B)
   end
 
   if b > 2*anc # last col of A by last row of B -> C
@@ -348,8 +391,103 @@ function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}; cutoff::Int
       #nmod_mat_window_init(Cb, C, 0, 0, 2*anr, 2*bnc);
       Cb = view(C, 1:2*anr, 1:2*bnc)
       #nmod_mat_addmul(Cb, Cb, Ac, Br);
-      mul!(Cb, Ac, Br, true)
+      Nemo.mul!(Cb, Ac, Br, true)
+  end
+end
+
+function apply!(A::fpMatrix, P::Perm{Int}; offset::Int = 0)
+  n = length(P.d)
+  t = zeros(Int, n-offset)
+  for i=1:n-offset
+    t[i] = unsafe_load(reinterpret(Ptr{Int}, A.rows), P.d[i] + offset)
+  end
+  for i=1:n-offset
+    unsafe_store!(reinterpret(Ptr{Int}, A.rows), t[i], i + offset)
+  end
+end
+
+function apply!(Q::Perm{Int}, P::Perm{Int}; offset::Int = 0)
+  n = length(P.d)
+  t = zeros(Int, n-offset)
+  for i=1:n-offset
+    t[i] = Q.d[P.d[i] + offset]
+  end
+  for i=1:n-offset
+    Q.d[i + offset] = t[i]
+  end
+end
+
+#takes the result of lu! and turns it into a proper lu
+function Nemo.lu(P::Perm, U, r::Int)
+  m = nrows(U)
+  n = ncols(U)
+  R = base_ring(U)
+  L = similar(U, m, m)
+
+  for i = 1:m
+    for j = 1:n
+      if i > j
+        L[i, j] = U[i, j]
+        U[i, j] = R(0)
+      elseif i == j
+        L[i, j] = R(1)
+      elseif j <= m
+        L[i, j] = R(0)
+      end
+    end
+  end
+  return r, P, L, U
+end
+
+function lu!_strassen(P::Perm{Int}, Ai; cutoff::Int = 300)
+  m = nrows(A)
+
+  @assert length(P.d) == m
+  n = ncols(A)
+  if n < cutoff
+    return lu!(P, A)
+  end
+  n1 = div(n, 2)
+  for i=1:m
+    P.d[i] = i
+  end
+  P1 = Hecke.AbstractAlgebra.Perm(m)
+  A0 = view(A, 1:m, 1:n1)
+  r1 = lu!_strassen(P1, A0; cutoff)
+  @assert r1 == n1
+  if r1 > 0 
+    apply!(A, P1)
+    apply!(P, P1)
   end
+
+  A00 = view(A, 1:r1, 1:r1)
+  A10 = view(A, r1+1:m, 1:r1)
+  A01 = view(A, 1:r1, n1+1:n)
+  A11 = view(A, r1+1:m, n1+1:n)
+
+  if r1 > 0
+    #Note: A00 is a view of A0 thus a view of A
+    # A0 is lu!, thus implicitly two triangular matrices giving the
+    # lu decomosition. solve_tril! looks ONLY at the lower part of A00
+    Nemo.solve_tril!(A01, A00, A01, 1)
+    X = A10 * A01
+    Nemo.sub!(A11, A11, X)
+  end
+
+  P1 = Perm(nrows(A11))
+  r2 = lu!_strassen(P1, A11)
+  apply!(A, P1, offset = r1)
+  apply!(P, P1, offset = r1)
+
+  if (r1 != n1)
+    for i=1:m-r1
+      for j=1:min(i, r2)
+        A[r1+i-1, r1+j-1] = A[r1+i-1, n1+j-1]
+        A[r1+i-1, n1+j-1] = 0
+      end
+    end
+  end
+  return r1 + r2
 end
 
 #
@@ -368,7 +506,8 @@ end
 #    pre-alloced tmp for the recursion
 #
 function inv_strassen(A)
-  if isodd(ncols(A)) || ncols(A) < 500
+#  @show size(A)
+  if isodd(ncols(A)) || ncols(A) < 400
     return inv(A)
   end
 #  @show :strassen, ncols(A)
@@ -404,11 +543,11 @@ function inv_strassen(A)
   return [X11 X12; X21 X22]
 end
 
-function solve_triu_left_strassen(b::MatElem, T::MatElem; cutoff::Int = cutoff)
-  #b*inv(T)
+function solve_triu_strassen(T::MatElem, b::MatElem; cutoff::Int = cutoff)
+  #b*inv(T), thus solves Tx = b for T upper triangular
   n = ncols(T)
   if n <= 50
-    R = solve_triu_left(b, T)
+    R = solve_triu(T, b)
     return R
   end
 
@@ -425,21 +564,21 @@ function solve_triu_left_strassen(b::MatElem, T::MatElem; cutoff::Int = cutoff)
   B = view(T, 1:n2, 1+n2:n)
   C = view(T, 1+n2:n, 1+n2:n)
 
-  S = solve_triu_left_strassen(U, A; cutoff)
-  R = solve_triu_left_strassen(X, A; cutoff)
+  S = solve_triu_strassen(A, U; cutoff)
+  R = solve_triu_strassen(A, X; cutoff)
 
   SS = mul_strassen(S, B; cutoff)
   sub!(SS, V, SS)
-  SS = solve_triu_left_strassen(SS, C; cutoff)
+  SS = solve_triu_strassen(C, SS; cutoff)
 
   RR = mul_strassen(R, B; cutoff)
   sub!(RR, Y, RR)
-  RR = solve_triu_left_strassen(RR, C; cutoff)
+  RR = solve_triu_strassen(C, RR; cutoff)
 
   return [S SS; R RR]
 end
 
-function solve_triu_left(b::ZZMatrix, U::ZZMatrix)
+function AbstractAlgebra.solve_triu(U::ZZMatrix, b::ZZMatrix)
    n = ncols(U)
    m = nrows(b)
    R = base_ring(U)
@@ -495,7 +634,7 @@ function Hecke.hadamard_bound2(M::ZZMatrix)
     if iszero(r)
       return r
     end
-    mul!(H, H, r)
+    Nemo.mul!(H, H, r)
   end
   return H
 end
@@ -557,17 +696,253 @@ function infinity_norm(A::ZZMatrix)
 end
 
 @inline function Nemo.mat_entry_ptr(b::fpMatrix, i::Int, j::Int)
-  unsafe_load(reinterpret(Ptr{Ptr{Clong}}, b.rows), i) + (j-1)*sizeof(Clong)
+  unsafe_load(reinterpret(Ptr{Ptr{Culong}}, b.rows), i) + (j-1)*sizeof(Culong)
 end
 
 function induce_crt!(A::ZZMatrix, p::ZZRingElem, B::fpMatrix, ::Integer; signed::Bool = false)
   #the second modulus is implicit in B: B.n
 
   ccall((:fmpz_mat_CRT_ui, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{ZZRingElem}, Ref{fpMatrix}, Int), A, A, p, B, signed)
-  mul!(p, p, B.n)
+  Nemo.mul!(p, p, B.n)
   return
 end
 
+function vector_int(a::fpMatrix)
+  b = zeros(Int, nrows(a) * ncols(a))
+  vector_int!(b, a)
+  return b
+end
+
+function vector_int!(a::Vector{Int}, b::fpMatrix)
+  ccall(:memcpy, Cvoid, (Ref{Int}, Ptr{Clong}, Int), a, b.entries, nrows(b)*ncols(b) * 8)
+end
+
+function _map_entries(k::Nemo.fpField, A::ZZMatrix)
+  #turns A into a fpMatrix with julias memory manager,
+  #create a "view" where the view_parent is a julia-Uint array with the
+  #entries
+  # => no finaliser
+  # => matrix(entries) can be used as a 1-dim julia array
+  #    (for broadcast)
+  u = zeros(Int, nrows(A)*ncols(A))
+  a = Nemo.fpMatrix()
+  a.base_ring = k
+  a.entries = reinterpret(Ptr{Cvoid}, pointer(u))
+  v = zeros(UInt, nrows(A))
+  for i=1:nrows(A)
+    v[i] = a.entries + (i-1)*ncols(A)*8
+  end
+  a.view_parent = (u, v)
+  a.rows = reinterpret(Ptr{Cvoid}, pointer(v))
+  a.r = nrows(A)
+  a.c = ncols(A)
+  change_prime(a, UInt(characteristic(k)))
+  map_entries!(a, k, A)
+  return a
+end
+
+@inline myround(a, p, pi) = @fastmath a-p*Base.rint_llvm(a*pi)
+
+function Nemo.mod!(A::AbstractArray{Float64}, p::Int)
+  return Nemo.mod!(A, Float64(p), 1.0/Float64(p))
+end
+
+function Nemo.mod!(A::AbstractArray{Float64}, pf::Float64, pfi::Float64)
+#  pf = Float64(p)
+#  pfi = 1/pf
+#  A = reshape(A, nrows(A)*ncols(A))
+#  A .= (x->myround(x, pf, pfi)).(A) 
+#  return
+  Ap = pointer(A)
+  @fastmath for i= 1:length(A)
+    a = Base.Intrinsics.pointerref(Ap, 1, 1)
+#    a = unsafe_load(Ap)
+    a -= pf*Base.rint_llvm(a*pfi)
+#     a = Base.modf(a, pf)
+    Base.Intrinsics.pointerset(Ap, a, 1, 1)
+#    unsafe_store!(Ap, a)
+    Ap += sizeof(Float64)
+  end
+end
+
+# Input: B vector of matrices as Double64 meant to be mod q[i]
+# Output: RE vector of matrices ...                       p[i]
+
+function change_rns!(RE::Vector{Matrix{Float64}}, p::Vector{Int}, q::Vector{Int}, B::Vector{<:AbstractArray{Float64}}, U::Vector{Matrix{Float64}} = []) 
+  T = Float64
+  bd = Int[]  # mixed radix rep (wrt to q) of (prod(q)-1)/2
+  pr = mapreduce(ZZ, *, q)
+  pr = div(pr, 2)
+  for i = q
+    push!(bd, pr % i)
+    pr = divexact(pr - bd[end], i)
+  end
+
+  qinv = Int[0]
+  for i=2:length(B)
+    push!(qinv, invmod(Int(mapreduce(ZZ, *, q[1:i-1]) % q[i]), q[i]))
+  end
+  cp = [x-Int(mapreduce(ZZ, *, q) % x) for x = p]
+
+  if length(U) < 1
+    push!(U, copy(B[1]))
+  else
+    U[1] .= B[1]
+  end
+  @fastmath for i=2:length(B)
+    if length(U) < i
+      push!(U, copy(U[i-1]))
+    else
+      U[i] .= U[i-1]
+    end
+    for j=i-2:-1:1
+      BLAS.scal!((q[j] % T(q[i])), U[i])
+      BLAS.axpy!(1.0, U[j], U[i])
+      mod!(U[i], q[i])
+    end
+    BLAS.scal!(T((qinv[i]*(q[i]-1)) % q[i]), U[i])
+    BLAS.axpy!(T(qinv[i]), B[i], U[i])
+    mod!(U[i], q[i])
+  end
+
+  for j = 1:length(p)
+    if length(RE) < j
+      push!(RE, copy(U[length(B)]))
+    else
+      RE[j] .= U[length(B)]
+    end
+    mod!(RE[j], p[j])
+    @fastmath for i=length(B)-1:-1:1
+      BLAS.scal!(T(q[i] % p[j]), RE[j])
+      BLAS.axpy!(1.0, U[i], RE[j])
+      mod!(RE[j], p[j])
+    end
+  end
+
+  for i=1:length(RE[1])
+    for j=1:length(B)
+      if U[j][i] > bd[j]
+        for k=1:length(p)
+          RE[k][i] = (RE[k][i] + cp[k]) % p[k]
+        end
+        break
+      elseif U[j][i] < bd[j]
+        break
+      end
+    end
+  end
+  return RE
+end
+
+# Storjohann's unimodular certification
+# We use CRT to obtain the modular inverse B0
+# The p-adic Hensel-lifting version (below) seems to be faster
+function UniCertZ_CRT_rns(A::ZZMatrix)
+  n = nrows(A);
+  #assume ncols == n
+  H = hadamard_bound2(A)
+  EntrySize = maximum(nbits, A)
+  e = max(16, 2+ceil(Int, 2*log2(n)+EntrySize))
+  println("Modulus X has  $e bits");
+  
+  B0 = Matrix{Float64}[]
+  m = ZZ(1); p = 2^21;  # MAGIC NUMBER (initial prime, probably should be about 2^29?)
+  Xp = Int[]
+  @time while nbits(m) < e
+    p = next_prime(p);
+    push!(Xp, p)
+    ZZmodP = GF(p, cached = false);
+    MatModP = map_entries(ZZmodP, A)
+    B00 = inv_strassen(MatModP)
+    push!(B0, map(Float64, lift(B00)).entries)
+    Nemo.mul!(m, m, p)
+  end
+  println("Got modular inverses");
+  #m is the X
+
+  Ap = Matrix{Float64}[]
+  m_inv = fpFieldElem[]
+  Y = ZZ(1)
+  Yp = Int[]
+  while nbits(Y) < Int(1+ceil(log2(n)+EntrySize)) 
+    p = next_prime(p)
+    push!(Yp, p)
+    k = GF(p, cached = false)
+    push!(Ap, map(Float64, lift(map_entries(k, A))).entries)
+    push!(m_inv, inv(k(m)))
+    Nemo.mul!(Y, Y, p)
+  end
+
+  # Compute max num iters in k
+  k = (n-1)*(EntrySize+log2(n)/2) - (2*log2(n) + EntrySize);
+  k = k/log2(m);
+  k = 2+ceil(Int, log2(k));
+  println("Max num iters is k=$(k)");
+
+  t = similar(B0[1])
+  RY = Matrix{Float64}[]
+  U = Matrix{Float64}[]
+  change_rns!(RY, Yp, Xp, B0, U)
+  for i = 1:length(Ap)
+    x = Ap[i]
+    t = RY[i]
+    t .= -x*t
+    mod!(t, Yp[i])
+    for j=1:n
+      t[j,j] += 1.0
+    end
+    BLAS.scal!(Float64(lift(m_inv[i])), t)
+    mod!(t, Yp[i])
+  end
+
+  if all(iszero, RY)
+    return true
+  end
+
+  RX = map(similar, B0)
+  MY = map(similar, Ap)
+  R = similar(B0[1])
+  M = similar(B0[1])
+  T = similar(M)
+
+  MX = Matrix{Float64}[]
+
+  for i in 0:k-1
+    println("UniCertZ: loop: i=$(i)");
+    @time change_rns!(RX, Xp, Yp, RY, U)
+    @time for t = 1:length(RX)
+      BLAS.gemm!('n', 'n', 1.0, RX[t], RX[t], 0.0, M)
+      Nemo.mod!(M, Xp[t])
+      if length(MX) < t
+        push!(MX, B0[t] * M)
+      else
+        BLAS.gemm!('n', 'n', 1.0, B0[t], M, 0.0, MX[t])
+      end
+      Nemo.mod!(MX[t], Xp[t])
+    end
+    @time change_rns!(MY, Yp, Xp, MX, U)
+    @time for k=1:length(Ap)
+      S = RY[k]
+      M = similar(S)
+      BLAS.gemm!('n', 'n', 1.0, S, S, 0.0, M)
+#      change_rns2!(M, Yp[k], Xp, MX, U)
+#      S .= S*S
+      BLAS.gemm!('n', 'n', -1.0, Ap[k], MY[k], 1.0, M)
+#      S .-= Ap[k] * M
+      mod!(M, Yp[k])
+      BLAS.scal!(Float64(lift(m_inv[k])), M)
+      mod!(M, Yp[k])
+      RY[k], M = M, RY[k]
+    end
+
+    if all(iszero, RY)
+      return true
+    end
+  end #for
+  return false
+end
+
+
 # Storjohann's unimodular certification
 # We use CRT to obtain the modular inverse B0
 # The p-adic Hensel-lifting version (below) seems to be faster
@@ -663,7 +1038,8 @@ function DetS(A::ZZMatrix, U::AbstractArray= -100:100)
 
   f = true
   local Ap::fpMatrix
-  @time while nbits(prod_p) < Ainf + n #+ 100
+  small = Ainf+n
+  @time while nbits(prod_p) < small + 20
     if f
       f = false
       Ap = map_entries(Nemo.fpField(p, false), A)
@@ -677,7 +1053,7 @@ function DetS(A::ZZMatrix, U::AbstractArray= -100:100)
     else
       det_p = crt(det_p, prod_p, d, ZZ(p))
     end
-    mul!(prod_p, prod_p, p)
+    Nemo.mul!(prod_p, prod_p, p)
     p = next_prime(p)
   end
   det_p = mod_sym(det_p, prod_p)
@@ -720,7 +1096,7 @@ function DetS(A::ZZMatrix, U::AbstractArray= -100:100)
         map_entries!(Ap, Nemo.fpField(p, false), A)
         d = _det(Ap)
         det_p = crt(det_p, prod_p, d, ZZ(p))
-        mul!(prod_p, prod_p, ZZ(p))
+        Nemo.mul!(prod_p, prod_p, ZZ(p))
       end
       mis = mod_sym(det_p*invmod(d1, prod_p), prod_p)
       return d1*mis
@@ -728,34 +1104,37 @@ function DetS(A::ZZMatrix, U::AbstractArray= -100:100)
     @time T1 = hcol(TS, d)  
     push!(T, T1)
     @show :solve
-    @time AT = solve_triu_left_strassen(AT, T1)
+    @time AT = solve_triu_strassen(T1, AT)
     @show nbits(prod_p), nbits(d1)
     @show nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p)))
-    if abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p)) < ZZ(2)^100
+    if nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p))) < small 
       break
     end
     @show nbits(d), Ainf
     if nbits(d) < Ainf + n
+      @show :doin_hnf
+      global last_hnf = (AT, d)
       h = hnf_modular_eldiv(AT, d)
       d1 *= prod([h[i,i] for i=1:n])
     @show Had / nbits(d1)  
-      AT = solve_triu_left_strassen(AT, h)
-      if abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p)) < ZZ(2)^100
+      AT = solve_triu_strassen(h, AT)
+      if nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p))) < small
         break
       end
     end
   end
   det_p = invmod(d1, prod_p)*det_p
   @show det_p = mod_sym(det_p, prod_p)
-  @assert abs(det_p) < ZZ(2)^100
+  @assert nbits(abs(det_p)) < small
   h = hnf_modular_eldiv(AT, det_p)
   @show det(h) // det_p, det(h)
   d1 *= det(h)
-  AT = solve_triu_left_strassen(AT, h)
+  AT = solve_triu_strassen(h, AT)
     println("DOING UNICERTZ");
     @show uni_cost(d1)
     @show crt_cost(d1)
     #need to cost this as well
+    global last_AT = AT
     @time fl = UniCertZ_CRT(AT)
     #           @assert fl == is_unit(det(ZAT));
     if fl
@@ -779,13 +1158,13 @@ function ratrec(a::ZZMatrix, b::ZZRingElem)
     for j=1:ncols(a)
 #      @show i, j
       ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ref{fmpz}, Ptr{fmpz}), T, a_ptr)
-      mul!(T, T, D)
-      mod!(T, T, b)
+      Nemo.mul!(T, T, D)
+      Nemo.mod!(T, T, b)
       fl = ratrec!(n, d, T, b)
       fl || return fl, A, D
       if !isone(d)
         D = D*d
-        mul!(A, A, d)
+        Nemo.mul!(A, A, d)
       end
       ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ptr{fmpz}, Ref{fmpz}), A_ptr, n)
 
@@ -816,11 +1195,11 @@ function ratrec!(n::ZZRingElem, d::ZZRingElem, a::ZZRingElem, b::ZZRingElem)
   l = 1
   N = deepcopy(b)
   D = ZZ(2)
-#  @show :start
-  @assert 0<a<b
+
+#  @assert 0<a<b
   done = false
   while !done && D <= N
-    mul!(D, D, D)
+    Nemo.mul!(D, D, D)
     div!(N, b, D)
     shift_right!(N, N, 1)
     if D>N
@@ -828,12 +1207,11 @@ function ratrec!(n::ZZRingElem, d::ZZRingElem, a::ZZRingElem, b::ZZRingElem)
       D = div(D, 2)
       done = true
     end
-#    @show N, D, b, 2*N*D
-    @assert 2*N*D < b
-#    @show a<N, nbits(a), nbits(N)
+
+#    @assert 2*N*D < b
+
     fl = ccall((:_fmpq_reconstruct_fmpz_2, Nemo.libflint), Bool, (Ref{fmpz}, Ref{fmpz}, Ref{fmpz}, Ref{fmpz}, Ref{fmpz}, Ref{fmpz}), n, d, a, b, N, D)
-#    @show fl, nbits(D), nbits(d), k - nbits(n)-nbits(d)
-#    @show n, d
+
     if fl && (nbits(n)+nbits(d) < k - 30 || D>N)
       return fl
     end
@@ -880,6 +1258,7 @@ mutable struct DixonCtx
   end
 end
 #copied from flint to allow the use of adaptive reconstruction,
+#support cases with small primes and Float64
 function dixon_init(A::ZZMatrix, B::ZZMatrix)
   D = DixonCtx()
   D.A = A
@@ -932,11 +1311,11 @@ function dixon_solve(D::DixonCtx, B::ZZMatrix)
   nexti = 1
   while ppow <= D.bound
     map_entries!(D.d_mod, Nemo.fpField(D.p, false), d)
-    mul!(D.y_mod, D.Ainv, D.d_mod)
+    Nemo.mul!(D.y_mod, D.Ainv, D.d_mod)
 
     ccall((:fmpz_mat_scalar_addmul_nmod_mat_fmpz, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{fpMatrix}, Ref{fmpz}), D.x, D.y_mod, ppow)
 
-    mul!(ppow, ppow, D.p)
+    Nemo.mul!(ppow, ppow, D.p)
     if ppow > D.bound
       break
     end
@@ -949,7 +1328,8 @@ function dixon_solve(D::DixonCtx, B::ZZMatrix)
 
       if fl
         # fl = (D.A*num == den*B)
-        sz = max(maximum(nbits, D.A) + maximum(nbits, num) + nbits(ncols(B)) + 1, 
+        sz = max(maximum(nbits, D.A) + maximum(nbits, num) 
+                                     + nbits(ncols(B)) + 1, 
                  maximum(nbits, B) + nbits(den))
         if sz < nbits(ppow)
           @show "no prod neccessary"
@@ -977,20 +1357,18 @@ function dixon_solve(D::DixonCtx, B::ZZMatrix)
 
     prod = ZZ(1)
     if false
-      mul!(D.Ay, D.A, lift(D.y_mod))
+      Nemo.mul!(D.Ay, D.A, lift(D.y_mod))
     else
       for j=1:length(D.crt_primes)
         change_prime(D.y_mod, D.crt_primes[j])
         change_prime(D.Ay_mod, D.crt_primes[j])
 
-        mul!(D.Ay_mod, D.A_mod[j], D.y_mod)
+        Nemo.mul!(D.Ay_mod, D.A_mod[j], D.y_mod)
         if j == 1
           lift!(D.Ay, D.Ay_mod)
           prod = ZZ(D.crt_primes[j])
         else
           induce_crt!(D.Ay, prod, D.Ay_mod, D.crt_primes[j], signed = true)
-#          ccall((:fmpz_mat_CRT_ui, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{fmpz}, Ref{fpMatrix}, Int), Ay, Ay, prod, Ay_mod, 1)
-#          prod *= crt_primes[j]
         end
       end
       change_prime(D.y_mod, D.p)

From 18c256fac3d853107ee1f06a8bab8e0865a19849 Mon Sep 17 00:00:00 2001
From: Claus Fieker <fieker@mathematik.uni-kl.de>
Date: Mon, 27 Nov 2023 12:01:30 +0100
Subject: [PATCH 3/5] updates

---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 7dd25acabe..c200bf474b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -28,7 +28,7 @@ GAPExt = "GAP"
 PolymakeExt = "Polymake"
 
 [compat]
-AbstractAlgebra = "^0.33.0"
+AbstractAlgebra = "0.34"
 Dates = "1.6"
 Distributed = "1.6"
 GAP = "0.9.6, 0.10"
@@ -37,7 +37,7 @@ LazyArtifacts = "1.6"
 Libdl = "1.6"
 LinearAlgebra = "1.6"
 Markdown = "1.6"
-Nemo = "^0.37.4"
+Nemo = "0.38"
 Pkg = "1.6"
 Polymake = "0.10, 0.11"
 Printf = "1.6"

From c1d1e2c24d3cb6e742684cb7b375c54436933e18 Mon Sep 17 00:00:00 2001
From: Claus Fieker <fieker@mathematik.uni-kl.de>
Date: Thu, 30 Nov 2023 17:37:28 +0100
Subject: [PATCH 4/5] sort strassen:

  move useful generic bits to AA
  useful special bits to Nemo
  Det code to Det.jl
---
 examples/Det.jl      |  895 +++++++++++++++++++++++++++++++
 examples/strassen.jl | 1203 +-----------------------------------------
 2 files changed, 898 insertions(+), 1200 deletions(-)
 create mode 100644 examples/Det.jl

diff --git a/examples/Det.jl b/examples/Det.jl
new file mode 100644
index 0000000000..8eb58d047f
--- /dev/null
+++ b/examples/Det.jl
@@ -0,0 +1,895 @@
+#=
+    Copyright (C) 2023, Claus Fieker, John Abbott
+=#
+
+module Det
+using Hecke
+import AbstractAlgebra, Nemo
+using LinearAlgebra, Profile, Base.Intrinsics
+import Nemo: add!, mul!, zero!, sub!, solve_triu!, solve_tril!
+
+#creates an unimodular n x n matrix where the inverse is much larger
+#than the original matrix. Entries are chosen in U
+function rand_mat(n::Int, U::AbstractArray=1:10)
+  C = zero_matrix(ZZ, n, n)
+  for i=1:n-1
+    C[i, i+1] = 1
+  end
+  for i=2:n
+    C[n, i] = rand(U)
+  end
+  C[n,1] = 1
+  for i=2:n-1
+    C[i,i] = rand(U)
+  end
+  return C
+end
+
+#applies n unimodular transformations (add scaled row/col)
+function perturb!(A, n::Int = 100, c::AbstractRange = -10:10)
+  for i=1:n 
+    x = rand(c)
+    while iszero(x)
+      x = rand(c)
+    end
+    if rand(0:1) == 1
+      k = rand(1:nrows(A))
+      l = rand(1:nrows(A))
+      while l == k
+        l = rand(1:nrows(A))
+      end
+      AbstractAlgebra.add_row!(A, x, k, l)
+    else
+      k = rand(1:ncols(A))
+      l = rand(1:ncols(A))
+      while l == k
+        l = rand(1:ncols(A))
+      end
+      AbstractAlgebra.add_column!(A, x, k, l)
+    end
+  end
+end
+
+#takes the result of lu! and turns it into a proper lu
+function Nemo.lu(P::Perm, U, r::Int)
+  m = nrows(U)
+  n = ncols(U)
+  R = base_ring(U)
+  L = similar(U, m, m)
+
+  for i = 1:m
+    for j = 1:n
+      if i > j
+        L[i, j] = U[i, j]
+        U[i, j] = R(0)
+      elseif i == j
+        L[i, j] = R(1)
+      elseif j <= m
+        L[i, j] = R(0)
+      end
+    end
+  end
+  return r, P, L, U
+end
+
+#
+# see https://arxiv.org/pdf/1901.00904.pdf
+# p2
+#
+#  A B  = I       0  * A 0 * I A^-1 B
+#  C D   C A^-1   I    0 X   0  I
+#
+# for X = D - C A^-1 B
+#
+# inv( I A; 0 I) = I -A; 0 I
+#
+# TODO:
+#    inplace
+#    pre-alloced tmp for the recursion
+#
+# only worksing iff A is invertble. In this case magically X is also invertable
+# in general need to use lu (lu!_strassen)
+# FAST when applicable
+function inv_strassen(A::fpMatrix)
+#  @show size(A)
+  if isodd(ncols(A)) || ncols(A) < 400
+    return inv(A)
+  end
+#  @show :strassen, ncols(A)
+  @assert nrows(A) == ncols(A)
+  @assert iseven(ncols(A))
+  n = div(ncols(A), 2)
+  A11 = view(A, 1:n, 1:n)
+  A12 = view(A, 1:n, n+1:2*n)
+  A21 = view(A, n+1:2*n, 1:n)
+  A22 = view(A, n+1:2*n, n+1:2*n)
+
+  A11_i = inv_strassen(A11)
+  A11_i_A12 = A11_i*A12
+  delta = A22-A21*A11_i_A12
+  X22 = delta_i = inv_strassen(delta)
+  A21_A11_i = A21*A11_i
+  X21 = -delta_i*A21_A11_i
+  X12 = -A11_i_A12*delta_i
+  X11 = A11_i-A11_i_A12*X21
+
+  X = similar(A)
+  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, 1:n, 1:n), X11)
+  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, 1:n, n+1:n+n), X12)
+  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, n+1:n+n, 1:n), X21)
+  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, n+1:n+n, n+1:n+n), X22)
+
+  return X
+
+  return [X11 X12; X21 X22]
+end
+
+# Triangular denominator: for a solution matrix "s", given denominator d and A= d*s.
+#  Uses presumably the following function
+## [17] gcdx(a::ZZRingElem, b::ZZRingElem)
+##     @ Nemo ~/OSCAR/GIT/Nemo.jl/src/flint/ZZRingElem.jl:1293
+# NOTE: mod(,) seems to return least-non-negative remainder
+# Storjohann
+function hcol(A::ZZMatrix, d::ZZRingElem)  # why not Vector{ZZRingElem}???
+  n = nrows(A)
+  w = deepcopy(A)
+  g = d
+  t = Array(ZZ,n)
+  h = zero_matrix(ZZ,n,n)
+  s = 1
+  v = 1
+  for i = 1:n
+    k = n-(i-1)
+    gg,e,f = gcdx(g,A[k,1])
+    t[k] = f
+    h[k,k] = g/gg
+    g = gg
+  end
+  for i = 1:n
+    if h[i,i] == 1
+      continue
+    end
+    for k = 1:i-1
+      h[k,i] = mod(-t[i]*w[k,1], h[i,i])
+      w[k,1] = mod(w[k,1]+h[k,i]*w[i,1], d)   
+    end
+    w[i,1] = h[i,i]*w[i,1]
+    d = d/h[i,i]
+    divexact!(w, w, h[i,i])
+  end
+  return h
+end
+
+@inline function Nemo.mat_entry_ptr(b::fpMatrix, i::Int, j::Int)
+  unsafe_load(reinterpret(Ptr{Ptr{Culong}}, b.rows), i) + (j-1)*sizeof(Culong)
+end
+
+function induce_crt!(A::ZZMatrix, p::ZZRingElem, B::fpMatrix, ::Integer; signed::Bool = false)
+  #the second modulus is implicit in B: B.n
+
+  ccall((:fmpz_mat_CRT_ui, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{ZZRingElem}, Ref{fpMatrix}, Int), A, A, p, B, signed)
+  Nemo.mul!(p, p, B.n)
+  return
+end
+
+function vector_int(a::fpMatrix)
+  b = zeros(Int, nrows(a) * ncols(a))
+  vector_int!(b, a)
+  return b
+end
+
+function vector_int!(a::Vector{Int}, b::fpMatrix)
+  ccall(:memcpy, Cvoid, (Ref{Int}, Ptr{Clong}, Int), a, b.entries, nrows(b)*ncols(b) * 8)
+end
+
+function _map_entries(k::Nemo.fpField, n::Int, m::Int)
+  #create a fpMatrix with julias memory manager,
+  #create a "view" where the view_parent is a julia-Uint array with the
+  #entries
+  # => no finaliser
+  # => matrix(entries) can be used as a 1-dim julia array
+  #    (for broadcast)
+  u = zeros(Int, n*m)
+  a = Nemo.fpMatrix()
+  a.base_ring = k
+  a.entries = reinterpret(Ptr{Cvoid}, pointer(u))
+  v = zeros(UInt, n)
+  for i=1:n
+    v[i] = a.entries + (i-1)*m*8
+  end
+  a.view_parent = (u, v)
+  a.rows = reinterpret(Ptr{Cvoid}, pointer(v))
+  a.r = n
+  a.c = m
+  change_prime(a, UInt(characteristic(k)))
+  return a
+end
+
+
+function _map_entries(k::Nemo.fpField, A::ZZMatrix)
+  #turns A into a fpMatrix with julias memory manager,
+  #create a "view" where the view_parent is a julia-Uint array with the
+  #entries
+  # => no finaliser
+  # => matrix(entries) can be used as a 1-dim julia array
+  #    (for broadcast)
+  a = _map_entries(k, nrows(A), ncols(A))
+  map_entries!(a, k, A)
+  return a
+end
+
+@inline myround(a, p, pi) = @fastmath a-p*Base.rint_llvm(a*pi)
+
+function mod!(A::AbstractArray{Float64}, p::Int)
+  return mod!(A, Float64(p), 1.0/Float64(p))
+end
+
+function mod!(A::AbstractArray{Float64}, pf::Float64, pfi::Float64)
+#  pf = Float64(p)
+#  pfi = 1/pf
+#  A = reshape(A, nrows(A)*ncols(A))
+#  A .= (x->myround(x, pf, pfi)).(A) 
+#  return
+  Ap = pointer(A)
+  @fastmath for i= 1:length(A)
+    a = Base.Intrinsics.pointerref(Ap, 1, 1)
+#    a = unsafe_load(Ap)
+    a -= pf*Base.rint_llvm(a*pfi)
+#     a = Base.modf(a, pf)
+    Base.Intrinsics.pointerset(Ap, a, 1, 1)
+#    unsafe_store!(Ap, a)
+    Ap += sizeof(Float64)
+  end
+end
+
+# Input: B vector of matrices as Double64 meant to be mod q[i]
+# Output: RE vector of matrices ...                       p[i]
+# fast and useful if number of primes is small
+# otherwise: not so much...
+# from hnfproj of Storjohann
+#
+# CRT mod pq: x = a mod p, x = b mod q, 1 = ep+fq
+#  x = fqa + epb
+#
+function change_rns!(RE::Vector{Matrix{Float64}}, p::Vector{Int}, q::Vector{Int}, B::Vector{<:AbstractArray{Float64}}, U::Vector{Matrix{Float64}} = []) 
+  @show length(q), length(p)
+  if length(p) > 40 # also slow, induce_crt! is killing it
+          # to try, for 20 bit primes, to do pairwise CRT using BLAS
+          # or just not use the UniCert_rns ...
+          # actually: if entries are large enough this is winning!q
+          # possibly: do an induce_crt_env where one can push data in
+    x = matrix(ZZ, map(x->ZZ(Int(x)), B[1]))
+    P = ZZ(q[1])
+    y = _map_entries(GF(3, cached = false), size(B[1], 1), size(B[1], 2))
+    for i=2:length(B)
+      y.view_parent[1] .= map(Base.rint_llvm, reshape(B[i], :))
+      change_prime(y, UInt(q[i]))
+      induce_crt!(x::ZZMatrix, P::ZZRingElem, y::fpMatrix, 1; signed = true)
+    end
+    for i=1:length(p)
+      map_entries!(y, GF(p[i], cached = false), x)
+      if length(RE) < i
+        push!(RE, reshape(map(Float64, y.view_parent[1]), size(B[1])))
+      else
+        RE[i] .= reshape(map(Float64, y.view_parent[1]), size(B[1]))
+      end
+    end
+    return 
+  end
+  T = Float64
+  bd = Int[]  # mixed radix rep (wrt to q) of (prod(q)-1)/2
+  pr = mapreduce(ZZ, *, q)
+  pr = div(pr, 2)
+  for i = q
+    push!(bd, pr % i)
+    pr = divexact(pr - bd[end], i)
+  end
+
+  qinv = Int[0]
+  for i=2:length(B)
+    push!(qinv, invmod(Int(mapreduce(ZZ, *, q[1:i-1]) % q[i]), q[i]))
+  end
+  cp = [x-Int(mapreduce(ZZ, *, q) % x) for x = p]
+
+  if length(U) < 1
+    push!(U, copy(B[1]))
+  else
+    U[1] .= B[1]
+  end
+  @fastmath for i=2:length(B)
+    if length(U) < i
+      push!(U, copy(U[i-1]))
+    else
+      U[i] .= U[i-1]
+    end
+    for j=i-2:-1:1
+      BLAS.scal!((q[j] % T(q[i])), U[i])
+      BLAS.axpy!(1.0, U[j], U[i])
+      mod!(U[i], q[i])
+    end
+    BLAS.scal!(T((qinv[i]*(q[i]-1)) % q[i]), U[i])
+    BLAS.axpy!(T(qinv[i]), B[i], U[i])
+    mod!(U[i], q[i])
+  end
+
+  for j = 1:length(p)
+    if length(RE) < j
+      push!(RE, copy(U[length(B)]))
+    else
+      RE[j] .= U[length(B)]
+    end
+    mod!(RE[j], p[j])
+    @fastmath for i=length(B)-1:-1:1
+      BLAS.scal!(T(q[i] % p[j]), RE[j])
+      BLAS.axpy!(1.0, U[i], RE[j])
+      mod!(RE[j], p[j])
+    end
+  end
+
+  for i=1:length(RE[1])
+    for j=1:length(B)
+      if U[j][i] > bd[j]
+        for k=1:length(p)
+          RE[k][i] = (RE[k][i] + cp[k]) % p[k]
+        end
+        break
+      elseif U[j][i] < bd[j]
+        break
+      end
+    end
+  end
+  return RE
+end
+
+# Storjohann's unimodular certification
+# We use CRT to obtain the modular inverse B0
+# The p-adic Hensel-lifting version (below) seems to be faster
+function UniCertZ_CRT_rns(A::ZZMatrix)
+  n = nrows(A);
+  #assume ncols == n
+  H = hadamard_bound2(A)
+  EntrySize = maximum(nbits, A)
+  e = max(16, 2+ceil(Int, 2*log2(n)+EntrySize))
+  println("Modulus X has  $e bits");
+  
+  B0 = Matrix{Float64}[]
+  m = ZZ(1); p = 2^21;  # MAGIC NUMBER (initial prime needs to be such that 
+                        #a full scalar product does not overflow:
+                        #n*(p-1)^2 < 2^53 is my guess
+  Xp = Int[]
+  @time while nbits(m) < e
+    p = next_prime(p);
+    push!(Xp, p)
+    ZZmodP = GF(p, cached = false);
+    MatModP = map_entries(ZZmodP, A)
+    B00 = inv_strassen(MatModP)
+    push!(B0, map(Float64, lift(B00)).entries)
+    Nemo.mul!(m, m, p)
+  end
+  println("Got modular inverses");
+  #m is the X
+
+  Ap = Matrix{Float64}[]
+  m_inv = fpFieldElem[]
+  Y = ZZ(1)
+  Yp = Int[]
+  while nbits(Y) < Int(1+ceil(log2(n)+EntrySize)) 
+    p = next_prime(p)
+    push!(Yp, p)
+    k = GF(p, cached = false)
+    push!(Ap, map(Float64, lift(map_entries(k, A))).entries)
+    push!(m_inv, inv(k(m)))
+    Nemo.mul!(Y, Y, p)
+  end
+
+  # Compute max num iters in k
+  k = (n-1)*(EntrySize+log2(n)/2) - (2*log2(n) + EntrySize);
+  k = k/log2(m);
+  k = 2+ceil(Int, log2(k));
+  println("Max num iters is k=$(k)");
+
+  t = similar(B0[1])
+  RY = Matrix{Float64}[]
+  U = Matrix{Float64}[]
+  change_rns!(RY, Yp, Xp, B0, U)
+  for i = 1:length(Ap)
+    x = Ap[i]
+    t = RY[i]
+    t .= -x*t
+    mod!(t, Yp[i])
+    for j=1:n
+      t[j,j] += 1.0
+    end
+    BLAS.scal!(Float64(lift(m_inv[i])), t)
+    mod!(t, Yp[i])
+  end
+
+  if all(iszero, RY)
+    return true
+  end
+
+  RX = map(similar, B0)
+  MY = map(similar, Ap)
+  R = similar(B0[1])
+  M = similar(B0[1])
+  T = similar(M)
+
+  MX = Matrix{Float64}[]
+
+  for i in 0:k-1
+    println("UniCertZ: loop: i=$(i)");
+    @time change_rns!(RX, Xp, Yp, RY, U)
+    @time for t = 1:length(RX)
+      BLAS.gemm!('n', 'n', 1.0, RX[t], RX[t], 0.0, M)
+      mod!(M, Xp[t])
+      if length(MX) < t
+        push!(MX, B0[t] * M)
+      else
+        BLAS.gemm!('n', 'n', 1.0, B0[t], M, 0.0, MX[t])
+      end
+      mod!(MX[t], Xp[t])
+    end
+    @time change_rns!(MY, Yp, Xp, MX, U)
+    @time for k=1:length(Ap)
+      S = RY[k]
+      M = similar(S)
+      BLAS.gemm!('n', 'n', 1.0, S, S, 0.0, M)
+#      change_rns2!(M, Yp[k], Xp, MX, U)
+#      S .= S*S
+      BLAS.gemm!('n', 'n', -1.0, Ap[k], MY[k], 1.0, M)
+#      S .-= Ap[k] * M
+      mod!(M, Yp[k])
+      BLAS.scal!(Float64(lift(m_inv[k])), M)
+      mod!(M, Yp[k])
+      RY[k], M = M, RY[k]
+    end
+
+    if all(iszero, RY)
+      return true
+    end
+  end #for
+  return false
+end
+
+
+# Storjohann's unimodular certification
+# We use CRT to obtain the modular inverse B0
+# The p-adic Hensel-lifting version (below) seems to be faster
+function UniCertZ_CRT(A::ZZMatrix)
+  n = nrows(A);
+  #assume ncols == n
+  H = hadamard_bound2(A)
+  EntrySize = maximum(abs, A)
+  e = max(16, Int(2+ceil(2*log2(n)+log2(EntrySize))));
+  println("Modulus has  $e bits");
+  
+  B0 = zero_matrix(ZZ,n,n) ## will be computed by crt in loop below
+  m = ZZ(1); p = 2^29;  # MAGIC NUMBER (initial prime, probably should be about 2^29?)
+  @time while (nbits(m) < e)
+    p = next_prime(p);
+    ZZmodP = GF(p, cached = false);
+    MatModP = map_entries(ZZmodP, A)
+    B00 = inv_strassen(MatModP)
+    induce_crt!(B0,m, B00,p, signed = true);  #  also updates:  m = m*p;
+  end
+  println("Got modular inverse");
+
+  # Compute max num iters in k
+  k = (n-1)*(log2(EntrySize)+log2(n)/2) - (2*log2(n) + log2(EntrySize));
+  k = k/log2(m);
+  k = 2+Int(ceil(log2(k)));
+  println("Max num iters is k=$(k)");
+
+  ZZmodM = residue_ring(ZZ,m);
+  B0_modm = map_entries(ZZmodM, B0);
+  I = identity_matrix(ZZ,n)
+  R = (I-A*B0)/m
+  if is_zero(R)
+    return true;
+  end
+  for i in 0:k-1
+    println("UniCertZ: loop: i=$(i)");
+    @time R_bar = R^2;
+    @time M = lift(B0_modm*map_entries(ZZmodM, R_bar));
+    Hecke.mod_sym!(M, m)
+    @time R = (R_bar - A*M)/m;
+    if is_zero(R)
+      return true
+    end
+  end #for
+  return false
+end
+
+function _det(a::fpMatrix)
+  a.r < 10 && return ZZ(lift(det(a)))
+  #_det avoids a copy: det is computed destructively
+  r = ccall((:_nmod_mat_det, Nemo.libflint), UInt, (Ref{fpMatrix}, ), a)
+  return ZZ(r)
+end
+
+
+# Determinant algorithm: U is the range for random RHS matrix (default -100:100)
+# Answer is !!! CORRECT UP TO SIGN !!!
+# should be trivial to fix as we compute some det mod odd p already
+function DetS(A::ZZMatrix, U::AbstractArray= -100:100; use_rns::Bool = false)
+  n = ncols(A)
+  T = ZZMatrix[]
+  AT = A
+  d1 = ZZ(1) # will be the det
+
+  p = UInt(next_prime(2^60))
+  det_p = ZZ(0)
+  prod_p = ZZ(1)
+  Ainf = maximum(nbits, A)
+  Had = div(nbits(hadamard_bound2(A)), 2)+1
+
+  #Dixon cost
+  # lifting has (at least) Had/60 many steps (as the numerator will be
+  # of maximal size)
+  # Each step uses (Ainf+60+n)/60 many modular products
+  #                +1 for lifting
+  #so that should be the number of quadratic steps
+  #
+  function dixon_cost(::ZZRingElem)
+    ceil(Int, (((Ainf+60+n)/60 + 1) * Had/60)/n)
+  end
+  #this should estimate the number of modular dets that will cost
+  #as much as the next dixon step.
+  #if we are close enough to Had, we move to crt.
+  function crt_cost(d::ZZRingElem)
+    return ceil(Int, (Had-nbits(d))/60)
+  end
+  function hnf_cost(d::ZZRingElem)
+    ceil(Int, nbits(d)/60)
+  end
+  function uni_cost(d::ZZRingElem)
+    ceil(Int, log(n)*(2*n+Ainf)/60)
+  end
+
+  f = true
+  local Ap::fpMatrix
+  small = Ainf+n
+  @time while nbits(prod_p) < small + 20
+    if f
+      f = false
+      Ap = map_entries(Nemo.fpField(p, false), A)
+    else
+      change_prime(Ap, p)
+      map_entries!(Ap, Nemo.fpField(p, false), A)
+    end
+    d = _det(Ap)
+    if det_p == 0
+      det_p = d
+    else
+      det_p = crt(det_p, prod_p, d, ZZ(p))
+    end
+    Nemo.mul!(prod_p, prod_p, p)
+    p = next_prime(p)
+  end
+  det_p = mod_sym(det_p, prod_p)
+  det_small = false
+  if nbits(prod_p) - nbits(det_p) > 30
+    @show "det small", det_p
+    det_small = true
+  end
+
+  local D::DixonCtx
+  f = true
+  while !det_small
+    b = rand(matrix_space(ZZ,n,1),U); 
+    if f
+      D = dixon_init(A, b)
+      f = false
+    end
+    @time TS, d = dixon_solve(D, b)
+    for i=1:length(T)
+      TS = T[i]*TS
+    end
+    c = content(TS)
+    g = gcd(c, d)
+    divexact!(TS, TS, g)
+    d = divexact(d, g)
+    println("DetS: loop: nbits(d)=$(nbits(d))");
+
+    @assert !isone(d)  #should never happen, the det_p should catch that
+    isone(d) && break
+    d1 *= d
+    @show  (Had - nbits(d1) - nbits(prod_p))/60
+    if Had - nbits(d1) < nbits(prod_p)
+      @show "at H-bound",  (Had - nbits(d1) - nbits(prod_p))/60
+      return d1
+    end
+    if (Had - nbits(d1) - nbits(prod_p))/60 < dixon_cost(d)
+      @show "finishing with CRT"
+      while nbits(prod_p)+nbits(d1) < Had
+        change_prime(Ap, p)
+        map_entries!(Ap, Nemo.fpField(p, false), A)
+        d = _det(Ap)
+        det_p = crt(det_p, prod_p, d, ZZ(p))
+        Nemo.mul!(prod_p, prod_p, ZZ(p))
+      end
+      mis = mod_sym(det_p*invmod(d1, prod_p), prod_p)
+      return d1*mis
+    end
+    @time T1 = hcol(TS, d)  
+    push!(T, T1)
+    @show :solve
+    @time AT = Strassrn.solve_triu(T1, AT)
+#    @show nbits(prod_p), nbits(d1)
+#    @show nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p)))
+    if nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p))) < small 
+      break
+    end
+#    @show nbits(d), Ainf
+    if nbits(d) < Ainf + n
+      @show :doin_hnf
+      h = hnf_modular_eldiv(AT, d)
+      d1 *= prod([h[i,i] for i=1:n])
+    @show Had / nbits(d1)  
+      AT = Strassen.solve_triu(h, AT)
+      if nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p))) < small
+        break
+      end
+    end
+  end
+  det_p = invmod(d1, prod_p)*det_p
+  @show det_p = mod_sym(det_p, prod_p)
+  @assert nbits(abs(det_p)) < small
+  h = hnf_modular_eldiv(AT, det_p)
+  @show det(h) // det_p, det(h)
+  d1 *= det(h)
+
+  AT = Strassen.solve_triu(h, AT)
+    println("DOING UNICERTZ");
+    @show uni_cost(d1)
+    @show crt_cost(d1)
+    #need to cost this as well
+
+    if use_rns
+      @time fl = UniCertZ_CRT_rns(AT)
+    else
+      @time fl = UniCertZ_CRT(AT)
+    end
+    #           @assert fl == is_unit(det(ZAT));
+    if fl
+      return d1
+    end
+    return d1
+    error("unimod failed")
+end
+
+#adaptive rational_reconstruction: if solution is unbalanced with
+#denominator smaller than numerator
+function induce_rational_reconstruction(a::ZZMatrix, b::ZZRingElem)
+  A = similar(a)
+  D = ZZ(1)
+  T = ZZ()
+  n = ZZ()
+  d = ZZ()
+  for i=1:nrows(a)
+    a_ptr = Nemo.mat_entry_ptr(a, i, 1)
+    A_ptr = Nemo.mat_entry_ptr(A, i, 1)
+    for j=1:ncols(a)
+#      @show i, j
+      ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Ptr{ZZRingElem}), T, a_ptr)
+      Nemo.mul!(T, T, D)
+      Nemo.mod!(T, T, b)
+      fl = ratrec!(n, d, T, b)
+      fl || return fl, A, D
+      if !isone(d)
+        D = D*d
+        Nemo.mul!(A, A, d)
+      end
+      ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ptr{ZZRingElem}, Ref{ZZRingElem}), A_ptr, n)
+
+      a_ptr += sizeof(ZZRingElem)
+      A_ptr += sizeof(ZZRingElem)
+    end
+  end
+  return true, A, D
+end
+
+function Nemo.div!(a::ZZRingElem, b::ZZRingElem, c::ZZRingElem)
+  ccall((:fmpz_tdiv_q, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Ref{ZZRingElem}, Ref{ZZRingElem}), a, b, c)
+end
+
+function shift_right!(a::ZZRingElem, b::ZZRingElem, i::Int)
+  ccall((:fmpz_fdiv_q_2exp, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Ref{ZZRingElem}, Int), a, b, i)
+end
+
+#output sensitive rational_reconstruction, in particular if
+#numerator is larger than den as in the induced case above
+function ratrec!(n::ZZRingElem, d::ZZRingElem, a::ZZRingElem, b::ZZRingElem)
+  k = nbits(b)
+  l = 1
+  N = deepcopy(b)
+  D = ZZ(2)
+
+#  @assert 0<a<b
+  done = false
+  while !done && D <= N
+    Nemo.mul!(D, D, D)
+    div!(N, b, D)
+    shift_right!(N, N, 1)
+    if D>N
+      D = N = iroot(b, 2)
+      D = div(D, 2)
+      done = true
+    end
+
+#    @assert 2*N*D < b
+
+    fl = ccall((:_fmpq_reconstruct_fmpz_2, Nemo.libflint), Bool, (Ref{ZZRingElem}, Ref{ZZRingElem}, Ref{ZZRingElem}, Ref{ZZRingElem}, Ref{ZZRingElem}, Ref{ZZRingElem}), n, d, a, b, N, D)
+
+    if fl && (nbits(n)+nbits(d) < k - 30 || D>N)
+      return fl
+    end
+    l += 1
+  end
+  return false
+end
+
+function Hecke.map_entries(k::Nemo.fpField, A::ZZMatrix)
+  a = zero_matrix(k, nrows(A), ncols(A))
+  return map_entries!(a, k, A)
+end
+
+function map_entries!(a::fpMatrix, k::Nemo.fpField, A::ZZMatrix)
+  ccall((:fmpz_mat_get_nmod_mat, Nemo.libflint), Cvoid, (Ref{fpMatrix}, Ref{ZZMatrix}), a, A)
+  return a
+end
+
+function change_prime(a::fpMatrix, p::UInt)
+  a.n = p  #in flint 3.0 available as nmod_mat_set_mod
+  a.norm = leading_zeros(p)
+  a.ninv = ccall((:n_preinvert_limb_prenorm, Nemo.libflint), UInt, (UInt, ), p << a.norm)
+end
+
+function lift!(A::ZZMatrix, a::fpMatrix)
+  ccall((:fmpz_mat_set_nmod_mat, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{fpMatrix}), A, a)
+end
+
+mutable struct DixonCtx
+  bound::ZZRingElem
+  Ainv::fpMatrix
+  p::UInt
+  crt_primes::Vector{UInt}
+  A_mod::Vector{fpMatrix}
+  d_mod::fpMatrix
+  y_mod::fpMatrix
+  Ay_mod::fpMatrix
+  Ay::ZZMatrix
+  x::ZZMatrix
+  d::ZZMatrix
+  A::ZZMatrix
+  function DixonCtx()
+    return new()
+  end
+end
+#copied from flint to allow the use of adaptive reconstruction,
+#support cases with small primes and Float64
+function dixon_init(A::ZZMatrix, B::ZZMatrix)
+  D = DixonCtx()
+  D.A = A
+
+  n = nrows(A)
+  cols = ncols(B)
+
+  _N = ZZ()
+  _D = ZZ()
+  ccall((:fmpz_mat_solve_bound, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Ref{ZZRingElem}, Ref{ZZMatrix}, Ref{ZZMatrix}), _N, _D, A, B)
+  Ainv = zero_matrix(GF(2, cached = false), n, n)
+  p = ccall((:fmpz_mat_find_good_prime_and_invert, Nemo.libflint), UInt, (Ref{fpMatrix}, Ref{ZZMatrix}, Ref{ZZRingElem}), Ainv, A, _D)
+  @assert p != 0
+  D.p = p
+  D.Ainv = Ainv
+
+  D.bound = 2*maximum(abs, [_D, _N])^2 * 2^30
+  D.crt_primes = UInt[]
+  D.A_mod = fpMatrix[]
+ 
+  pr = ZZ(1)
+  xp = p
+  maxA = maximum(abs, A) *(p-1)*n*2
+
+  while true
+    push!(D.crt_primes, xp)
+    push!(D.A_mod, map_entries(Nemo.fpField(xp, false), A))
+    pr *= xp
+    if pr > maxA
+      break
+    end
+    xp = next_prime(xp)
+  end
+
+  k = Nemo.fpField(p, false)
+  D.d_mod = zero_matrix(k, nrows(B), ncols(B))
+  D.y_mod = zero_matrix(k, nrows(B), ncols(B))
+  D.Ay_mod = zero_matrix(k, nrows(B), ncols(B))
+  D.Ay = zero_matrix(ZZ, nrows(B), ncols(B))
+  D.x = zero_matrix(ZZ, nrows(B), ncols(B))
+
+  return D
+end
+
+function dixon_solve(D::DixonCtx, B::ZZMatrix)
+  zero!(D.x)
+  d = deepcopy(B)
+  ppow = ZZ(1)
+  i = 1
+  nexti = 1
+  while ppow <= D.bound
+    map_entries!(D.d_mod, Nemo.fpField(D.p, false), d)
+    Nemo.mul!(D.y_mod, D.Ainv, D.d_mod)
+
+    ccall((:fmpz_mat_scalar_addmul_nmod_mat_fmpz, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{fpMatrix}, Ref{ZZRingElem}), D.x, D.y_mod, ppow)
+
+    Nemo.mul!(ppow, ppow, D.p)
+    if ppow > D.bound
+      break
+    end
+
+    stabilised = i == nexti;
+
+    if stabilised
+      nexti = ceil(Int,(i*1.4)) + 1;
+      #maybe col by col? to stop doing cols that are already there?
+      #main use currently is 1 col anyway
+      fl, num, den = induce_rational_reconstruction(D.x, ppow)
+
+      if fl
+        # fl = (D.A*num == den*B)
+        sz = max(maximum(nbits, D.A) + maximum(nbits, num) 
+                                     + nbits(ncols(B)) + 1, 
+                 maximum(nbits, B) + nbits(den))
+        if sz < nbits(ppow)
+#          @show "no prod neccessary"
+        else
+          xp = next_prime(D.p)
+#          @show ceil(Int, (sz - nbits(ppow))/60)
+          for i=1:ceil(Int, (sz - nbits(ppow))/60)
+            k = Nemo.fpField(xp, false)
+            Ap = map_entries(k, D.A)
+            Bp = map_entries(k, B)
+            nump = map_entries(k, num)
+            fl = Ap*nump == Bp*k(den)
+            fl || break
+            xp = next_prime(xp)
+          end
+        end
+#        @show fl
+        fl && return num, den
+      end
+    end
+  
+    i += 1
+
+    prod = ZZ(1)
+    if false
+      Nemo.mul!(D.Ay, D.A, lift(D.y_mod))
+    else
+      for j=1:length(D.crt_primes)
+        change_prime(D.y_mod, D.crt_primes[j])
+        change_prime(D.Ay_mod, D.crt_primes[j])
+
+        Nemo.mul!(D.Ay_mod, D.A_mod[j], D.y_mod)
+        if j == 1
+          lift!(D.Ay, D.Ay_mod)
+          prod = ZZ(D.crt_primes[j])
+        else
+          induce_crt!(D.Ay, prod, D.Ay_mod, D.crt_primes[j], signed = true)
+        end
+      end
+      change_prime(D.y_mod, D.p)
+    end
+    sub!(d, d, D.Ay)
+    divexact!(d, d, ZZ(D.p))
+  end
+  fl, num, den = ratrec(D.x, ppow)
+
+  @assert fl
+#  @time @assert D.A*num == den*B
+  return num, den
+end
+
+end # module
diff --git a/examples/strassen.jl b/examples/strassen.jl
index c065ddf410..076e9b0293 100644
--- a/examples/strassen.jl
+++ b/examples/strassen.jl
@@ -11,7 +11,7 @@
     (at your option) any later version.  See <http://www.gnu.org/licenses/>.
 
     to julia:
-    Copyright (C) 2020, Claus Fieker
+    Copyright (C) 2020,2023 Claus Fieker
 =#
 
 module Strassen
@@ -22,47 +22,7 @@ import Hecke.Nemo: add!, mul!, zero!, sub!, solve_triu!, solve_tril!
 
 const cutoff = 1500
 
-#creates an unimodular n x n matrix where the inverse is much larger
-#than the original matrix. Entries are chosen in U
-function rand_mat(n::Int, U::AbstractArray=1:10)
-  C = zero_matrix(ZZ, n, n)
-  for i=1:n-1
-    C[i, i+1] = 1
-  end
-  for i=2:n
-    C[n, i] = rand(U)
-  end
-  C[n,1] = 1
-  for i=2:n-1
-    C[i,i] = rand(U)
-  end
-  return C
-end
-
-function perturb!(A, n::Int = 100, c::AbstractRange = -10:10)
-  for i=1:n 
-    x = rand(c)
-    while iszero(x)
-      x = rand(c)
-    end
-    if rand(0:1) == 1
-      k = rand(1:nrows(A))
-      l = rand(1:nrows(A))
-      while l == k
-        l = rand(1:nrows(A))
-      end
-      A[k, :] += x*A[l, :]
-    else
-      k = rand(1:ncols(A))
-      l = rand(1:ncols(A))
-      while l == k
-        l = rand(1:ncols(A))
-      end
-      A[:, k] += x*A[:, l]
-    end
-  end
-end
-
+#base case for the strassen
 function Nemo.mul!(C::AbstractArray, A::AbstractArray, B::AbstractArray, add::Bool = false)
   @assert size(C) == (2,2) && size(A) == (2,2) && size(B) == (2,2)
   C[1,1] = A[1,1] * B[1,1] + A[1,2] * B[2,1]
@@ -236,1163 +196,6 @@ function mul_strassen!(C::AbstractArray, A::AbstractArray, B::AbstractArray)
   end
 end
 
-function mul_strassen!(A::Generic.Mat, B::Generic.Mat, C::Generic.Mat)
-  mul_strassen!(C.entries, A.entries, B.entries)
-end
-
-function zero!(A::MatElem{T}) where {T}
-  error("not done yet")
-end
-
-#function zero!(A::fpMatrix)
-#  ccall((:nmod_mat_zero, Nemo.libflint), Cvoid, (Ref{fpMatrix}, ), A)
-#end
-
-#function zero!(A::ZZMatrix)
-#  ccall((:fmpz_mat_zero, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, ), A)
-#end
-#
-#function sub!(A::ZZMatrix, B::ZZMatrix, C::ZZMatrix)
-#  ccall((:fmpz_mat_sub, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{ZZMatrix}), A, B, C)
-#end
-
-#function sub!(A::fpMatrix, B::fpMatrix, C::fpMatrix)
-#  ccall((:fmpz_mat_sub, Nemo.libflint), Cvoid, (Ref{fpMatrix}, Ref{fpMatrix}, Ref{fpMatrix}), A, B, C)
-#end
-
-function mul_strassen(A::MatElem{T}, B::MatElem{T}; cutoff::Int = cutoff) where {T}
-  C = zero_matrix(base_ring(A), nrows(A), ncols(B))
-  mul_strassen!(C, A, B; cutoff)
-  return C
-end
-
-function mul_strassen!(C::MatElem{T}, A::MatElem{T}, B::MatElem{T}; cutoff::Int = cutoff) where {T}
-  sA = size(A)
-  sB = size(B)
-  sC = size(C)
-  a = sA[1]
-  b = sA[2]
-  c = sB[2]
-
-  @assert a == sC[1] && b == sB[1] && c == sC[2]
-
-  if (a <= cutoff || b <= cutoff || c <= cutoff)
-      Nemo.mul!(C, A, B)
-      return
-  end
-
-  anr = div(a, 2)
-  anc = div(b, 2)
-  bnr = anc
-  bnc = div(c, 2)
-
-  #nmod_mat_window_init(A11, A, 0, 0, anr, anc);
-  #nmod_mat_window_init(A12, A, 0, anc, anr, 2*anc);
-  #nmod_mat_window_init(A21, A, anr, 0, 2*anr, anc);
-  #nmod_mat_window_init(A22, A, anr, anc, 2*anr, 2*anc);
-  A11 = view(A, 1:anr, 1:anc)
-  A12 = view(A, 1:anr, anc+1:2*anc)
-  A21 = view(A, anr+1:2*anr, 1:anc)
-  A22 = view(A, anr+1:2*anr, anc+1:2*anc)
-
-  #nmod_mat_window_init(B11, B, 0, 0, bnr, bnc);
-  #nmod_mat_window_init(B12, B, 0, bnc, bnr, 2*bnc);
-  #nmod_mat_window_init(B21, B, bnr, 0, 2*bnr, bnc);
-  #nmod_mat_window_init(B22, B, bnr, bnc, 2*bnr, 2*bnc);
-  B11 = view(B, 1:bnr, 1:bnc)
-  B12 = view(B, 1:bnr, bnc+1:2*bnc)
-  B21 = view(B, bnr+1:2*bnr, 1:bnc)
-  B22 = view(B, bnr+1:2*bnr, bnc+1:2*bnc)
-
-  #nmod_mat_window_init(C11, C, 0, 0, anr, bnc);
-  #nmod_mat_window_init(C12, C, 0, bnc, anr, 2*bnc);
-  #nmod_mat_window_init(C21, C, anr, 0, 2*anr, bnc);
-  #nmod_mat_window_init(C22, C, anr, bnc, 2*anr, 2*bnc);
-  C11 = view(C, 1:anr, 1:bnc)
-  C12 = view(C, 1:anr, bnc+1:2*bnc)
-  C21 = view(C, anr+1:2*anr, 1:bnc)
-  C22 = view(C, anr+1:2*anr, bnc+1:2*bnc)
-
-  #nmod_mat_init(X1, anr, FLINT_MAX(bnc, anc), A->mod.n);
-  #nmod_mat_init(X2, anc, bnc, A->mod.n);
-
-  #X1->c = anc;
-
-  #=
-      See Jean-Guillaume Dumas, Clement Pernet, Wei Zhou; "Memory
-      efficient scheduling of Strassen-Winograd's matrix multiplication
-      algorithm"; http://arxiv.org/pdf/0707.2347v3 for reference on the
-      used operation scheduling.
-  =#
-
-  X1 = A11 - A21
-  X2 = B22 - B12
-  #nmod_mat_mul(C21, X1, X2);
-  mul_strassen!(C21, X1, X2; cutoff)
-
-  add!(X1, A21, A22);
-  sub!(X2, B12, B11);
-  #nmod_mat_mul(C22, X1, X2);
-  mul_strassen!(C22, X1, X2; cutoff)
-
-  sub!(X1, X1, A11);
-  sub!(X2, B22, X2);
-  #nmod_mat_mul(C12, X1, X2);
-  mul_strassen!(C12, X1, X2; cutoff)
-
-  sub!(X1, A12, X1);
-  #nmod_mat_mul(C11, X1, B22);
-  mul_strassen!(C11, X1, B22; cutoff)
-
-  #X1->c = bnc;
-  #nmod_mat_mul(X1, A11, B11);
-  zero!(X1)
-  mul_strassen!(X1, A11, B11; cutoff)
-
-  add!(C12, X1, C12);
-  add!(C21, C12, C21);
-  add!(C12, C12, C22);
-  add!(C22, C21, C22);
-  add!(C12, C12, C11);
-  sub!(X2, X2, B21);
-  #nmod_mat_mul(C11, A22, X2);
-  mul_strassen!(C11, A22, X2; cutoff)
-
-  sub!(C21, C21, C11);
-
-  #nmod_mat_mul(C11, A12, B21);
-  mul_strassen!(C11, A12, B21; cutoff)
-
-  add!(C11, X1, C11);
-
-  if c > 2*bnc #A by last col of B -> last col of C
-      #nmod_mat_window_init(Bc, B, 0, 2*bnc, b, c);
-      Bc = view(B, 1:b, 2*bnc+1:c)
-      #nmod_mat_window_init(Cc, C, 0, 2*bnc, a, c);
-      Cc = view(C, 1:a, 2*bnc+1:c)
-      #nmod_mat_mul(Cc, A, Bc);
-      Nemo.mul!(Cc, A, Bc)
-  end
-
-  if a > 2*anr #last row of A by B -> last row of C
-      #nmod_mat_window_init(Ar, A, 2*anr, 0, a, b);
-      Ar = view(A, 2*anr+1:a, 1:b)
-      #nmod_mat_window_init(Cr, C, 2*anr, 0, a, c);
-      Cr = view(C, 2*anr+1:a, 1:c)
-      #nmod_mat_mul(Cr, Ar, B);
-     Nemo.mul!(Cr, Ar, B)
-  end
-
-  if b > 2*anc # last col of A by last row of B -> C
-      #nmod_mat_window_init(Ac, A, 0, 2*anc, 2*anr, b);
-      Ac = view(A, 1:2*anr, 2*anc:b)
-      #nmod_mat_window_init(Br, B, 2*bnr, 0, b, 2*bnc);
-      Br = view(B, 2*bnr+1:b, 1:2*bnc)
-      #nmod_mat_window_init(Cb, C, 0, 0, 2*anr, 2*bnc);
-      Cb = view(C, 1:2*anr, 1:2*bnc)
-      #nmod_mat_addmul(Cb, Cb, Ac, Br);
-      Nemo.mul!(Cb, Ac, Br, true)
-  end
-end
-
-function apply!(A::fpMatrix, P::Perm{Int}; offset::Int = 0)
-  n = length(P.d)
-  t = zeros(Int, n-offset)
-  for i=1:n-offset
-    t[i] = unsafe_load(reinterpret(Ptr{Int}, A.rows), P.d[i] + offset)
-  end
-  for i=1:n-offset
-    unsafe_store!(reinterpret(Ptr{Int}, A.rows), t[i], i + offset)
-  end
-end
-
-function apply!(Q::Perm{Int}, P::Perm{Int}; offset::Int = 0)
-  n = length(P.d)
-  t = zeros(Int, n-offset)
-  for i=1:n-offset
-    t[i] = Q.d[P.d[i] + offset]
-  end
-  for i=1:n-offset
-    Q.d[i + offset] = t[i]
-  end
-end
-
-#takes the result of lu! and turns it into a proper lu
-function Nemo.lu(P::Perm, U, r::Int)
-  m = nrows(U)
-  n = ncols(U)
-  R = base_ring(U)
-  L = similar(U, m, m)
-
-  for i = 1:m
-    for j = 1:n
-      if i > j
-        L[i, j] = U[i, j]
-        U[i, j] = R(0)
-      elseif i == j
-        L[i, j] = R(1)
-      elseif j <= m
-        L[i, j] = R(0)
-      end
-    end
-  end
-  return r, P, L, U
-end
-
-function lu!_strassen(P::Perm{Int}, Ai; cutoff::Int = 300)
-  m = nrows(A)
-
-  @assert length(P.d) == m
-  n = ncols(A)
-  if n < cutoff
-    return lu!(P, A)
-  end
-  n1 = div(n, 2)
-  for i=1:m
-    P.d[i] = i
-  end
-  P1 = Hecke.AbstractAlgebra.Perm(m)
-  A0 = view(A, 1:m, 1:n1)
-  r1 = lu!_strassen(P1, A0; cutoff)
-  @assert r1 == n1
-  if r1 > 0 
-    apply!(A, P1)
-    apply!(P, P1)
-  end
-
-  A00 = view(A, 1:r1, 1:r1)
-  A10 = view(A, r1+1:m, 1:r1)
-  A01 = view(A, 1:r1, n1+1:n)
-  A11 = view(A, r1+1:m, n1+1:n)
-
-  if r1 > 0
-    #Note: A00 is a view of A0 thus a view of A
-    # A0 is lu!, thus implicitly two triangular matrices giving the
-    # lu decomosition. solve_tril! looks ONLY at the lower part of A00
-    Nemo.solve_tril!(A01, A00, A01, 1)
-    X = A10 * A01
-    Nemo.sub!(A11, A11, X)
-  end
-
-  P1 = Perm(nrows(A11))
-  r2 = lu!_strassen(P1, A11)
-  apply!(A, P1, offset = r1)
-  apply!(P, P1, offset = r1)
-
-  if (r1 != n1)
-    for i=1:m-r1
-      for j=1:min(i, r2)
-        A[r1+i-1, r1+j-1] = A[r1+i-1, n1+j-1]
-        A[r1+i-1, n1+j-1] = 0
-      end
-    end
-  end
-  return r1 + r2
-end
-
-#
-# see https://arxiv.org/pdf/1901.00904.pdf
-# p2
-#
-#  A B  = I       0  * A 0 * I A^-1 B
-#  C D   C A^-1   I    0 X   0  I
-#
-# for X = D - C A^-1 B
-#
-# inv( I A; 0 I) = I -A; 0 I
-#
-# TODO:
-#    inplace
-#    pre-alloced tmp for the recursion
-#
-function inv_strassen(A)
-#  @show size(A)
-  if isodd(ncols(A)) || ncols(A) < 400
-    return inv(A)
-  end
-#  @show :strassen, ncols(A)
-  @assert nrows(A) == ncols(A)
-  @assert iseven(ncols(A))
-  n = div(ncols(A), 2)
-  A11 = view(A, 1:n, 1:n)
-  A12 = view(A, 1:n, n+1:2*n)
-  A21 = view(A, n+1:2*n, 1:n)
-  A22 = view(A, n+1:2*n, n+1:2*n)
-
-#  I = identity_matrix(base_ring(A), n)
-#  Z = zero_matrix(base_ring(A), n, n)
-
-
-  A11_i = inv_strassen(A11)
-  A11_i_A12 = A11_i*A12
-  delta = A22-A21*A11_i_A12
-  X22 = delta_i = inv_strassen(delta)
-  A21_A11_i = A21*A11_i
-  X21 = -delta_i*A21_A11_i
-  X12 = -A11_i_A12*delta_i
-  X11 = A11_i-A11_i_A12*X21
-
-  X = similar(A)
-  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, 1:n, 1:n), X11)
-  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, 1:n, n+1:n+n), X12)
-  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, n+1:n+n, 1:n), X21)
-  ccall((:nmod_mat_set, Hecke.libflint), Nothing, (Ref{fpMatrix}, Ref{fpMatrix}), view(X, n+1:n+n, n+1:n+n), X22)
-
-  return X
-
-  return [X11 X12; X21 X22]
-end
-
-function solve_triu_strassen(T::MatElem, b::MatElem; cutoff::Int = cutoff)
-  #b*inv(T), thus solves Tx = b for T upper triangular
-  n = ncols(T)
-  if n <= 50
-    R = solve_triu(T, b)
-    return R
-  end
-
-  n2 = div(n, 2) + n % 2
-  m = nrows(b)
-  m2 = div(m, 2) + m % 2
-
-  U = view(b, 1:m2, 1:n2)
-  V = view(b, 1:m2, n2+1:n)
-  X = view(b, m2+1:m, 1:n2)
-  Y = view(b, m2+1:m, n2+1:n)
-
-  A = view(T, 1:n2, 1:n2)
-  B = view(T, 1:n2, 1+n2:n)
-  C = view(T, 1+n2:n, 1+n2:n)
-
-  S = solve_triu_strassen(A, U; cutoff)
-  R = solve_triu_strassen(A, X; cutoff)
-
-  SS = mul_strassen(S, B; cutoff)
-  sub!(SS, V, SS)
-  SS = solve_triu_strassen(C, SS; cutoff)
-
-  RR = mul_strassen(R, B; cutoff)
-  sub!(RR, Y, RR)
-  RR = solve_triu_strassen(C, RR; cutoff)
-
-  return [S SS; R RR]
-end
-
-function AbstractAlgebra.solve_triu(U::ZZMatrix, b::ZZMatrix)
-   n = ncols(U)
-   m = nrows(b)
-   R = base_ring(U)
-   X = zero(b)
-   tmp = zero_matrix(ZZ, 1, n)
-   t = R()
-   s = R()
-   for i = 1:m
-      tmp_p = Nemo.mat_entry_ptr(tmp, 1, 1)
-      X_p = Nemo.mat_entry_ptr(X, i, 1)
-      for j = 1:n
-         ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ptr{fmpz}, Ptr{fmpz}), tmp_p, X_p)
-         X_p += sizeof(ZZRingElem)
-         tmp_p += sizeof(ZZRingElem)
-      end
-      for j = 1:n
-         ccall((:fmpz_zero, Nemo.libflint), Cvoid, (Ref{fmpz}, ), s) 
-
-         tmp_p = Nemo.mat_entry_ptr(tmp, 1, 1)
-         for k = 1:j-1
-            U_p = Nemo.mat_entry_ptr(U, k, j)
-            ccall((:fmpz_addmul, Nemo.libflint), Cvoid, (Ref{fmpz}, Ptr{fmpz}, Ptr{fmpz}), s, U_p, tmp_p)
-            tmp_p += sizeof(ZZRingElem)
-         end
-         ccall((:fmpz_sub, Nemo.libflint), Cvoid, 
-          (Ref{fmpz}, Ptr{fmpz}, Ref{fmpz}), s, Nemo.mat_entry_ptr(b, i, j), s)
-         ccall((:fmpz_divexact, Nemo.libflint), Cvoid, 
-          (Ptr{fmpz}, Ref{fmpz}, Ptr{fmpz}), Nemo.mat_entry_ptr(tmp, 1, j), s, Nemo.mat_entry_ptr(U, j, j))
-      end
-      tmp_p = Nemo.mat_entry_ptr(tmp, 1, 1)
-      X_p = Nemo.mat_entry_ptr(X, i, 1)
-      for j = 1:n
-         ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ptr{fmpz}, Ptr{fmpz}), X_p, tmp_p)
-         X_p += sizeof(ZZRingElem)
-         tmp_p += sizeof(ZZRingElem)
-      end
-   end
-   return X
-end
-
-function Hecke.hadamard_bound2(M::ZZMatrix)
-  is_square(M) || error("Matrix must be square")
-  H = ZZ(1);
-  r = ZZ(0)
-  n = nrows(M)
-  for i in 1:n
-    ccall((:fmpz_set_si, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Int), r, 0)
-    M_ptr = Nemo.mat_entry_ptr(M, i, 1)
-    for j in 1:n
-      ccall((:fmpz_addmul, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Ptr{ZZRingElem}, Ptr{ZZRingElem}), r, M_ptr, M_ptr)
-      M_ptr += sizeof(ZZRingElem)
-    end
-    if iszero(r)
-      return r
-    end
-    Nemo.mul!(H, H, r)
-  end
-  return H
-end
-
-function Base.maximum(::typeof(nbits), M::ZZMatrix)
-  mx = 0
-  n = nrows(M)
-  m = ncols(M)
-  M_ptr = Nemo.mat_entry_ptr(M, 1, 1)
-  for i in 1:n
-    for j in 1:m
-      if !iszero(unsafe_load(reinterpret(Ptr{Int}, M_ptr)))
-        mx = max(mx, ccall((:fmpz_bits, Nemo.libflint), Int, (Ptr{fmpz},), M_ptr))
-      end
-      M_ptr += sizeof(ZZRingElem)
-    end
-  end
-  return mx
-end
-
-
-# Triangular denominator: for a solution matrix "s", given denominator d and A= d*s.
-#  Uses presumably the following function
-## [17] gcdx(a::ZZRingElem, b::ZZRingElem)
-##     @ Nemo ~/OSCAR/GIT/Nemo.jl/src/flint/fmpz.jl:1293
-# NOTE: mod(,) seems to return least-non-negative remainder
-function hcol(A::ZZMatrix, d::ZZRingElem)  # why not Vector{ZZRingElem}???
-  n = nrows(A)
-  w = deepcopy(A)
-  g = d
-  t = Array(ZZ,n)
-  h = zero_matrix(ZZ,n,n)
-  s = 1
-  v = 1
-  for i = 1:n
-    k = n-(i-1)
-    gg,e,f = gcdx(g,A[k,1])
-    t[k] = f
-    h[k,k] = g/gg
-    g = gg
-  end
-  for i = 1:n
-    if h[i,i] == 1
-      continue
-    end
-    for k = 1:i-1
-      h[k,i] = mod(-t[i]*w[k,1], h[i,i])
-      w[k,1] = mod(w[k,1]+h[k,i]*w[i,1], d)   
-    end
-    w[i,1] = h[i,i]*w[i,1]
-    d = d/h[i,i]
-    divexact!(w, w, h[i,i])
-  end
-  return h
-end
-
-function infinity_norm(A::ZZMatrix)
-  return maximum(abs, A)
-end
-
-@inline function Nemo.mat_entry_ptr(b::fpMatrix, i::Int, j::Int)
-  unsafe_load(reinterpret(Ptr{Ptr{Culong}}, b.rows), i) + (j-1)*sizeof(Culong)
-end
-
-function induce_crt!(A::ZZMatrix, p::ZZRingElem, B::fpMatrix, ::Integer; signed::Bool = false)
-  #the second modulus is implicit in B: B.n
-
-  ccall((:fmpz_mat_CRT_ui, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{ZZMatrix}, Ref{ZZRingElem}, Ref{fpMatrix}, Int), A, A, p, B, signed)
-  Nemo.mul!(p, p, B.n)
-  return
-end
-
-function vector_int(a::fpMatrix)
-  b = zeros(Int, nrows(a) * ncols(a))
-  vector_int!(b, a)
-  return b
-end
-
-function vector_int!(a::Vector{Int}, b::fpMatrix)
-  ccall(:memcpy, Cvoid, (Ref{Int}, Ptr{Clong}, Int), a, b.entries, nrows(b)*ncols(b) * 8)
-end
-
-function _map_entries(k::Nemo.fpField, A::ZZMatrix)
-  #turns A into a fpMatrix with julias memory manager,
-  #create a "view" where the view_parent is a julia-Uint array with the
-  #entries
-  # => no finaliser
-  # => matrix(entries) can be used as a 1-dim julia array
-  #    (for broadcast)
-  u = zeros(Int, nrows(A)*ncols(A))
-  a = Nemo.fpMatrix()
-  a.base_ring = k
-  a.entries = reinterpret(Ptr{Cvoid}, pointer(u))
-  v = zeros(UInt, nrows(A))
-  for i=1:nrows(A)
-    v[i] = a.entries + (i-1)*ncols(A)*8
-  end
-  a.view_parent = (u, v)
-  a.rows = reinterpret(Ptr{Cvoid}, pointer(v))
-  a.r = nrows(A)
-  a.c = ncols(A)
-  change_prime(a, UInt(characteristic(k)))
-  map_entries!(a, k, A)
-  return a
-end
-
-@inline myround(a, p, pi) = @fastmath a-p*Base.rint_llvm(a*pi)
-
-function Nemo.mod!(A::AbstractArray{Float64}, p::Int)
-  return Nemo.mod!(A, Float64(p), 1.0/Float64(p))
-end
-
-function Nemo.mod!(A::AbstractArray{Float64}, pf::Float64, pfi::Float64)
-#  pf = Float64(p)
-#  pfi = 1/pf
-#  A = reshape(A, nrows(A)*ncols(A))
-#  A .= (x->myround(x, pf, pfi)).(A) 
-#  return
-  Ap = pointer(A)
-  @fastmath for i= 1:length(A)
-    a = Base.Intrinsics.pointerref(Ap, 1, 1)
-#    a = unsafe_load(Ap)
-    a -= pf*Base.rint_llvm(a*pfi)
-#     a = Base.modf(a, pf)
-    Base.Intrinsics.pointerset(Ap, a, 1, 1)
-#    unsafe_store!(Ap, a)
-    Ap += sizeof(Float64)
-  end
-end
-
-# Input: B vector of matrices as Double64 meant to be mod q[i]
-# Output: RE vector of matrices ...                       p[i]
-
-function change_rns!(RE::Vector{Matrix{Float64}}, p::Vector{Int}, q::Vector{Int}, B::Vector{<:AbstractArray{Float64}}, U::Vector{Matrix{Float64}} = []) 
-  T = Float64
-  bd = Int[]  # mixed radix rep (wrt to q) of (prod(q)-1)/2
-  pr = mapreduce(ZZ, *, q)
-  pr = div(pr, 2)
-  for i = q
-    push!(bd, pr % i)
-    pr = divexact(pr - bd[end], i)
-  end
-
-  qinv = Int[0]
-  for i=2:length(B)
-    push!(qinv, invmod(Int(mapreduce(ZZ, *, q[1:i-1]) % q[i]), q[i]))
-  end
-  cp = [x-Int(mapreduce(ZZ, *, q) % x) for x = p]
-
-  if length(U) < 1
-    push!(U, copy(B[1]))
-  else
-    U[1] .= B[1]
-  end
-  @fastmath for i=2:length(B)
-    if length(U) < i
-      push!(U, copy(U[i-1]))
-    else
-      U[i] .= U[i-1]
-    end
-    for j=i-2:-1:1
-      BLAS.scal!((q[j] % T(q[i])), U[i])
-      BLAS.axpy!(1.0, U[j], U[i])
-      mod!(U[i], q[i])
-    end
-    BLAS.scal!(T((qinv[i]*(q[i]-1)) % q[i]), U[i])
-    BLAS.axpy!(T(qinv[i]), B[i], U[i])
-    mod!(U[i], q[i])
-  end
-
-  for j = 1:length(p)
-    if length(RE) < j
-      push!(RE, copy(U[length(B)]))
-    else
-      RE[j] .= U[length(B)]
-    end
-    mod!(RE[j], p[j])
-    @fastmath for i=length(B)-1:-1:1
-      BLAS.scal!(T(q[i] % p[j]), RE[j])
-      BLAS.axpy!(1.0, U[i], RE[j])
-      mod!(RE[j], p[j])
-    end
-  end
-
-  for i=1:length(RE[1])
-    for j=1:length(B)
-      if U[j][i] > bd[j]
-        for k=1:length(p)
-          RE[k][i] = (RE[k][i] + cp[k]) % p[k]
-        end
-        break
-      elseif U[j][i] < bd[j]
-        break
-      end
-    end
-  end
-  return RE
-end
-
-# Storjohann's unimodular certification
-# We use CRT to obtain the modular inverse B0
-# The p-adic Hensel-lifting version (below) seems to be faster
-function UniCertZ_CRT_rns(A::ZZMatrix)
-  n = nrows(A);
-  #assume ncols == n
-  H = hadamard_bound2(A)
-  EntrySize = maximum(nbits, A)
-  e = max(16, 2+ceil(Int, 2*log2(n)+EntrySize))
-  println("Modulus X has  $e bits");
-  
-  B0 = Matrix{Float64}[]
-  m = ZZ(1); p = 2^21;  # MAGIC NUMBER (initial prime, probably should be about 2^29?)
-  Xp = Int[]
-  @time while nbits(m) < e
-    p = next_prime(p);
-    push!(Xp, p)
-    ZZmodP = GF(p, cached = false);
-    MatModP = map_entries(ZZmodP, A)
-    B00 = inv_strassen(MatModP)
-    push!(B0, map(Float64, lift(B00)).entries)
-    Nemo.mul!(m, m, p)
-  end
-  println("Got modular inverses");
-  #m is the X
-
-  Ap = Matrix{Float64}[]
-  m_inv = fpFieldElem[]
-  Y = ZZ(1)
-  Yp = Int[]
-  while nbits(Y) < Int(1+ceil(log2(n)+EntrySize)) 
-    p = next_prime(p)
-    push!(Yp, p)
-    k = GF(p, cached = false)
-    push!(Ap, map(Float64, lift(map_entries(k, A))).entries)
-    push!(m_inv, inv(k(m)))
-    Nemo.mul!(Y, Y, p)
-  end
-
-  # Compute max num iters in k
-  k = (n-1)*(EntrySize+log2(n)/2) - (2*log2(n) + EntrySize);
-  k = k/log2(m);
-  k = 2+ceil(Int, log2(k));
-  println("Max num iters is k=$(k)");
-
-  t = similar(B0[1])
-  RY = Matrix{Float64}[]
-  U = Matrix{Float64}[]
-  change_rns!(RY, Yp, Xp, B0, U)
-  for i = 1:length(Ap)
-    x = Ap[i]
-    t = RY[i]
-    t .= -x*t
-    mod!(t, Yp[i])
-    for j=1:n
-      t[j,j] += 1.0
-    end
-    BLAS.scal!(Float64(lift(m_inv[i])), t)
-    mod!(t, Yp[i])
-  end
-
-  if all(iszero, RY)
-    return true
-  end
-
-  RX = map(similar, B0)
-  MY = map(similar, Ap)
-  R = similar(B0[1])
-  M = similar(B0[1])
-  T = similar(M)
-
-  MX = Matrix{Float64}[]
-
-  for i in 0:k-1
-    println("UniCertZ: loop: i=$(i)");
-    @time change_rns!(RX, Xp, Yp, RY, U)
-    @time for t = 1:length(RX)
-      BLAS.gemm!('n', 'n', 1.0, RX[t], RX[t], 0.0, M)
-      Nemo.mod!(M, Xp[t])
-      if length(MX) < t
-        push!(MX, B0[t] * M)
-      else
-        BLAS.gemm!('n', 'n', 1.0, B0[t], M, 0.0, MX[t])
-      end
-      Nemo.mod!(MX[t], Xp[t])
-    end
-    @time change_rns!(MY, Yp, Xp, MX, U)
-    @time for k=1:length(Ap)
-      S = RY[k]
-      M = similar(S)
-      BLAS.gemm!('n', 'n', 1.0, S, S, 0.0, M)
-#      change_rns2!(M, Yp[k], Xp, MX, U)
-#      S .= S*S
-      BLAS.gemm!('n', 'n', -1.0, Ap[k], MY[k], 1.0, M)
-#      S .-= Ap[k] * M
-      mod!(M, Yp[k])
-      BLAS.scal!(Float64(lift(m_inv[k])), M)
-      mod!(M, Yp[k])
-      RY[k], M = M, RY[k]
-    end
-
-    if all(iszero, RY)
-      return true
-    end
-  end #for
-  return false
-end
-
-
-# Storjohann's unimodular certification
-# We use CRT to obtain the modular inverse B0
-# The p-adic Hensel-lifting version (below) seems to be faster
-function UniCertZ_CRT(A::ZZMatrix)
-  n = nrows(A);
-  #assume ncols == n
-  H = hadamard_bound2(A)
-  EntrySize = infinity_norm(A)
-  e = max(16, Int(2+ceil(2*log2(n)+log2(EntrySize))));
-  println("Modulus has  $e bits");
-  
-  B0 = MatrixSpace(ZZ,n,n)(0); ## will be computed by crt in loop below
-  m = ZZ(1); p = 2^29;  # MAGIC NUMBER (initial prime, probably should be about 2^29?)
-  @time while (nbits(m) < e)
-    p = next_prime(p);
-    ZZmodP = GF(p, cached = false);
-    MatModP = map_entries(ZZmodP, A)
-    B00 = inv_strassen(MatModP)
-    induce_crt!(B0,m, B00,p, signed = true);  #  also updates:  m = m*p;
-  end
-  println("Got modular inverse");
-
-  # Compute max num iters in k
-  k = (n-1)*(log2(EntrySize)+log2(n)/2) - (2*log2(n) + log2(EntrySize));
-  k = k/log2(m);
-  k = 2+Int(ceil(log2(k)));
-  println("Max num iters is k=$(k)");
-
-  ZZmodM = residue_ring(ZZ,m);
-  B0_modm = map_entries(ZZmodM, B0);
-  I = MatrixSpace(ZZ,n,n)(1)
-  R = (I-A*B0)/m
-  if is_zero(R)
-    return true;
-  end
-  for i in 0:k-1
-    println("UniCertZ: loop: i=$(i)");
-    @time R_bar = R^2;
-    @time M = lift(B0_modm*map_entries(ZZmodM, R_bar));
-    Hecke.mod_sym!(M, m)
-    @time R = (R_bar - A*M)/m;
-    if is_zero(R)
-      return true
-    end
-  end #for
-  return false
-end
-
-function _det(a::fpMatrix)
-  a.r < 10 && return det(a)
-  #_det avoids a copy: det is computed destructively
-  r = ccall((:_nmod_mat_det, Nemo.libflint), UInt, (Ref{fpMatrix}, ), a)
-  return ZZ(r)
-end
-
-
-# Determinant algorithm: U is the range for random RHS matrix (default -100:100)
-# Answer is !!! CORRECT UP TO SIGN !!!
-function DetS(A::ZZMatrix, U::AbstractArray= -100:100)
-  n = ncols(A)
-  T = ZZMatrix[]
-  AT = A
-  d1 = ZZ(1) # will be the det
-
-  p = UInt(next_prime(2^60))
-  det_p = ZZ(0)
-  prod_p = ZZ(1)
-  Ainf = maximum(nbits, A)
-  Had = div(nbits(Hecke.hadamard_bound2(A)), 2)+1
-
-  #Dixon cost
-  # lifting has (at least) Had/60 many steps (as the numerator will be
-  # of maximal size)
-  # Each step uses (Ainf+60+n)/60 many modular products
-  #                +1 for lifting
-  #so that should be the number of quadratic steps
-  #
-  function dixon_cost(::fmpz)
-    ceil(Int, (((Ainf+60+n)/60 + 1) * Had/60)/n)
-  end
-  #this should estimate the number of modular dets that will cost
-  #as much as the next dixon step.
-  #if we are close enough to Had, we move to crt.
-  function crt_cost(d::fmpz)
-    return ceil(Int, (Had-nbits(d))/60)
-  end
-  function hnf_cost(d::fmpz)
-    ceil(Int, nbits(d)/60)
-  end
-  function uni_cost(d::fmpz)
-    ceil(Int, log(n)*(2*n+Ainf)/60)
-  end
-
-  f = true
-  local Ap::fpMatrix
-  small = Ainf+n
-  @time while nbits(prod_p) < small + 20
-    if f
-      f = false
-      Ap = map_entries(Nemo.fpField(p, false), A)
-    else
-      change_prime(Ap, p)
-      map_entries!(Ap, Nemo.fpField(p, false), A)
-    end
-    d = _det(Ap)
-    if det_p == 0
-      det_p = d
-    else
-      det_p = crt(det_p, prod_p, d, ZZ(p))
-    end
-    Nemo.mul!(prod_p, prod_p, p)
-    p = next_prime(p)
-  end
-  det_p = mod_sym(det_p, prod_p)
-  det_small = false
-  if nbits(prod_p) - nbits(det_p) > 30
-    @show "det small", det_p
-    det_small = true
-  end
-
-  local D::DixonCtx
-  f = true
-  while !det_small
-    b = rand(matrix_space(ZZ,n,1),U); 
-    if f
-      D = dixon_init(A, b)
-      f = false
-    end
-    @time TS, d = dixon_solve(D, b)
-    for i=1:length(T)
-      TS = T[i]*TS
-    end
-    c = content(TS)
-    g = gcd(c, d)
-    divexact!(TS, TS, g)
-    d = divexact(d, g)
-    println("DetS: loop: nbits(d)=$(nbits(d))");
-    global last_bad = b, T, d
-    @assert !isone(d)  #should never happen, the det_p should catch that
-    isone(d) && break
-    d1 *= d
-    @show  (Had - nbits(d1) - nbits(prod_p))/60
-    if Had - nbits(d1) < nbits(prod_p)
-      @show "at H-bound",  (Had - nbits(d1) - nbits(prod_p))/60
-      return d1
-    end
-    if (Had - nbits(d1) - nbits(prod_p))/60 < dixon_cost(d)
-      @show "finishing with CRT"
-      while nbits(prod_p)+nbits(d1) < Had
-        change_prime(Ap, p)
-        map_entries!(Ap, Nemo.fpField(p, false), A)
-        d = _det(Ap)
-        det_p = crt(det_p, prod_p, d, ZZ(p))
-        Nemo.mul!(prod_p, prod_p, ZZ(p))
-      end
-      mis = mod_sym(det_p*invmod(d1, prod_p), prod_p)
-      return d1*mis
-    end
-    @time T1 = hcol(TS, d)  
-    push!(T, T1)
-    @show :solve
-    @time AT = solve_triu_strassen(T1, AT)
-    @show nbits(prod_p), nbits(d1)
-    @show nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p)))
-    if nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p))) < small 
-      break
-    end
-    @show nbits(d), Ainf
-    if nbits(d) < Ainf + n
-      @show :doin_hnf
-      global last_hnf = (AT, d)
-      h = hnf_modular_eldiv(AT, d)
-      d1 *= prod([h[i,i] for i=1:n])
-    @show Had / nbits(d1)  
-      AT = solve_triu_strassen(h, AT)
-      if nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p))) < small
-        break
-      end
-    end
-  end
-  det_p = invmod(d1, prod_p)*det_p
-  @show det_p = mod_sym(det_p, prod_p)
-  @assert nbits(abs(det_p)) < small
-  h = hnf_modular_eldiv(AT, det_p)
-  @show det(h) // det_p, det(h)
-  d1 *= det(h)
-  AT = solve_triu_strassen(h, AT)
-    println("DOING UNICERTZ");
-    @show uni_cost(d1)
-    @show crt_cost(d1)
-    #need to cost this as well
-    global last_AT = AT
-    @time fl = UniCertZ_CRT(AT)
-    #           @assert fl == is_unit(det(ZAT));
-    if fl
-      return d1
-    end
-    return d1
-    error("unimod failed")
-end
-
-#adaptive rational_reconstruction: if solution is unbalanced with
-#denominator smaller than numerator
-function ratrec(a::ZZMatrix, b::ZZRingElem)
-  A = similar(a)
-  D = ZZ(1)
-  T = ZZ()
-  n = ZZ()
-  d = ZZ()
-  for i=1:nrows(a)
-    a_ptr = Nemo.mat_entry_ptr(a, i, 1)
-    A_ptr = Nemo.mat_entry_ptr(A, i, 1)
-    for j=1:ncols(a)
-#      @show i, j
-      ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ref{fmpz}, Ptr{fmpz}), T, a_ptr)
-      Nemo.mul!(T, T, D)
-      Nemo.mod!(T, T, b)
-      fl = ratrec!(n, d, T, b)
-      fl || return fl, A, D
-      if !isone(d)
-        D = D*d
-        Nemo.mul!(A, A, d)
-      end
-      ccall((:fmpz_set, Nemo.libflint), Cvoid, (Ptr{fmpz}, Ref{fmpz}), A_ptr, n)
-
-      a_ptr += sizeof(ZZRingElem)
-      A_ptr += sizeof(ZZRingElem)
-    end
-  end
-  return true, A, D
-end
-
-function Nemo.div!(a::ZZRingElem, b::ZZRingElem, c::ZZRingElem)
-  ccall((:fmpz_tdiv_q, Nemo.libflint), Cvoid, (Ref{ZZRingElem}, Ref{ZZRingElem}, Ref{ZZRingElem}), a, b, c)
-end
-
-function shift_right!(a::ZZRingElem, b::ZZRingElem, i::Int)
-  ccall((:fmpz_fdiv_q_2exp, Nemo.libflint), Cvoid, (Ref{fmpz}, Ref{fmpz}, Int), a, b, i)
-end
-
-function ratrec(a::ZZRingElem, b::ZZRingElem)
-  n = ZZ()
-  d = ZZ()
-  fl = ratrec!(n, d, a, b)
-  return fl, n, d
-end
-
-function ratrec!(n::ZZRingElem, d::ZZRingElem, a::ZZRingElem, b::ZZRingElem)
-  k = nbits(b)
-  l = 1
-  N = deepcopy(b)
-  D = ZZ(2)
-
-#  @assert 0<a<b
-  done = false
-  while !done && D <= N
-    Nemo.mul!(D, D, D)
-    div!(N, b, D)
-    shift_right!(N, N, 1)
-    if D>N
-      D = N = iroot(b, 2)
-      D = div(D, 2)
-      done = true
-    end
-
-#    @assert 2*N*D < b
-
-    fl = ccall((:_fmpq_reconstruct_fmpz_2, Nemo.libflint), Bool, (Ref{fmpz}, Ref{fmpz}, Ref{fmpz}, Ref{fmpz}, Ref{fmpz}, Ref{fmpz}), n, d, a, b, N, D)
-
-    if fl && (nbits(n)+nbits(d) < k - 30 || D>N)
-      return fl
-    end
-    l += 1
-  end
-  return false
-end
-
-function Hecke.map_entries(k::Nemo.fpField, A::ZZMatrix)
-  a = zero_matrix(k, nrows(A), ncols(A))
-  return map_entries!(a, k, A)
-end
-
-function map_entries!(a::fpMatrix, k::Nemo.fpField, A::ZZMatrix)
-  ccall((:fmpz_mat_get_nmod_mat, Nemo.libflint), Cvoid, (Ref{fpMatrix}, Ref{ZZMatrix}), a, A)
-  return a
-end
-
-function change_prime(a::fpMatrix, p::UInt)
-  a.n = p  #in flint 3.0 available as nmod_mat_set_mod
-  a.norm = leading_zeros(p)
-  a.ninv = ccall((:n_preinvert_limb_prenorm, Nemo.libflint), UInt, (UInt, ), p << a.norm)
-end
-
-function lift!(A::ZZMatrix, a::fpMatrix)
-  ccall((:fmpz_mat_set_nmod_mat, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{fpMatrix}), A, a)
-end
-
-mutable struct DixonCtx
-  bound::ZZRingElem
-  Ainv::fpMatrix
-  p::UInt
-  crt_primes::Vector{UInt}
-  A_mod::Vector{fpMatrix}
-  d_mod::fpMatrix
-  y_mod::fpMatrix
-  Ay_mod::fpMatrix
-  Ay::ZZMatrix
-  x::ZZMatrix
-  d::ZZMatrix
-  A::ZZMatrix
-  function DixonCtx()
-    return new()
-  end
-end
-#copied from flint to allow the use of adaptive reconstruction,
-#support cases with small primes and Float64
-function dixon_init(A::ZZMatrix, B::ZZMatrix)
-  D = DixonCtx()
-  D.A = A
-
-  n = nrows(A)
-  cols = ncols(B)
-
-  _N = ZZ()
-  _D = ZZ()
-  ccall((:fmpz_mat_solve_bound, Nemo.libflint), Cvoid, (Ref{fmpz}, Ref{fmpz}, Ref{ZZMatrix}, Ref{ZZMatrix}), _N, _D, A, B)
-  Ainv = zero_matrix(GF(2, cached = false), n, n)
-  p = ccall((:fmpz_mat_find_good_prime_and_invert, Nemo.libflint), UInt, (Ref{fpMatrix}, Ref{ZZMatrix}, Ref{fmpz}), Ainv, A, _D)
-  @assert p != 0
-  D.p = p
-  D.Ainv = Ainv
-
-  D.bound = 2*maximum(abs, [_D, _N])^2 * 2^30
-  D.crt_primes = UInt[]
-  D.A_mod = fpMatrix[]
- 
-  pr = ZZ(1)
-  xp = p
-  maxA = maximum(abs, A) *(p-1)*n*2
-
-  while true
-    push!(D.crt_primes, xp)
-    push!(D.A_mod, map_entries(Nemo.fpField(xp, false), A))
-    pr *= xp
-    if pr > maxA
-      break
-    end
-    xp = next_prime(xp)
-  end
-
-  k = Nemo.fpField(p, false)
-  D.d_mod = zero_matrix(k, nrows(B), ncols(B))
-  D.y_mod = zero_matrix(k, nrows(B), ncols(B))
-  D.Ay_mod = zero_matrix(k, nrows(B), ncols(B))
-  D.Ay = zero_matrix(ZZ, nrows(B), ncols(B))
-  D.x = zero_matrix(ZZ, nrows(B), ncols(B))
-
-  return D
-end
-
-function dixon_solve(D::DixonCtx, B::ZZMatrix)
-  zero!(D.x)
-  d = deepcopy(B)
-  ppow = ZZ(1)
-  i = 1
-  nexti = 1
-  while ppow <= D.bound
-    map_entries!(D.d_mod, Nemo.fpField(D.p, false), d)
-    Nemo.mul!(D.y_mod, D.Ainv, D.d_mod)
-
-    ccall((:fmpz_mat_scalar_addmul_nmod_mat_fmpz, Nemo.libflint), Cvoid, (Ref{ZZMatrix}, Ref{fpMatrix}, Ref{fmpz}), D.x, D.y_mod, ppow)
-
-    Nemo.mul!(ppow, ppow, D.p)
-    if ppow > D.bound
-      break
-    end
-
-    stabilised = i == nexti;
-
-    if stabilised
-      nexti = ceil(Int,(i*1.4)) + 1;
-      fl, num, den = ratrec(D.x, ppow)
-
-      if fl
-        # fl = (D.A*num == den*B)
-        sz = max(maximum(nbits, D.A) + maximum(nbits, num) 
-                                     + nbits(ncols(B)) + 1, 
-                 maximum(nbits, B) + nbits(den))
-        if sz < nbits(ppow)
-          @show "no prod neccessary"
-        else
-          xp = next_prime(D.p)
-          @show ceil(Int, (sz - nbits(ppow))/60)
-          for i=1:ceil(Int, (sz - nbits(ppow))/60)
-            k = Nemo.fpField(xp, false)
-            Ap = map_entries(k, D.A)
-            Bp = map_entries(k, B)
-            nump = map_entries(k, num)
-            fl = Ap*nump == Bp*k(den)
-            fl || break
-            xp = next_prime(xp)
-          end
-        end
-        @show fl
-        fl && return num, den
-      end
-    end
-  
-    i += 1
-#    i % 100 == 0 && @show i
-#    i % 1000 == 0 && return
-
-    prod = ZZ(1)
-    if false
-      Nemo.mul!(D.Ay, D.A, lift(D.y_mod))
-    else
-      for j=1:length(D.crt_primes)
-        change_prime(D.y_mod, D.crt_primes[j])
-        change_prime(D.Ay_mod, D.crt_primes[j])
-
-        Nemo.mul!(D.Ay_mod, D.A_mod[j], D.y_mod)
-        if j == 1
-          lift!(D.Ay, D.Ay_mod)
-          prod = ZZ(D.crt_primes[j])
-        else
-          induce_crt!(D.Ay, prod, D.Ay_mod, D.crt_primes[j], signed = true)
-        end
-      end
-      change_prime(D.y_mod, D.p)
-    end
-    sub!(d, d, D.Ay)
-    divexact!(d, d, ZZ(D.p))
-  end
-  fl, num, den = ratrec(D.x, ppow)
-
-  @assert fl
-#  @time @assert D.A*num == den*B
-  return num, den
-end
-
-
-#= example
-
- A=rand(MatrixSpace(ZZ,10,10),1:10);
- DetS(A, -10:10)
-
- A=rand(MatrixSpace(ZZ,300,300),10:100);
- @time DetS(A)
-
-=#
-
+#see AbstractAlgebra.Strassen for an MatElem version
 
 end # module

From 6fa14b2b517faf04f034bb70c2f734734b6d5623 Mon Sep 17 00:00:00 2001
From: Claus Fieker <fieker@mathematik.uni-kl.de>
Date: Thu, 30 Nov 2023 19:41:19 +0100
Subject: [PATCH 5/5] use solve_triu_left... th Strassen is still missing for
 it

---
 examples/Det.jl | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/examples/Det.jl b/examples/Det.jl
index 8eb58d047f..fe299297e6 100644
--- a/examples/Det.jl
+++ b/examples/Det.jl
@@ -359,10 +359,10 @@ function UniCertZ_CRT_rns(A::ZZMatrix)
   Xp = Int[]
   @time while nbits(m) < e
     p = next_prime(p);
-    push!(Xp, p)
     ZZmodP = GF(p, cached = false);
     MatModP = map_entries(ZZmodP, A)
     B00 = inv_strassen(MatModP)
+    push!(Xp, p)
     push!(B0, map(Float64, lift(B00)).entries)
     Nemo.mul!(m, m, p)
   end
@@ -507,6 +507,9 @@ function _det(a::fpMatrix)
   return ZZ(r)
 end
 
+function t_det(h::ZZMatrix)
+  return Hecke.prod_diagonal(h)
+end
 
 # Determinant algorithm: U is the range for random RHS matrix (default -100:100)
 # Answer is !!! CORRECT UP TO SIGN !!!
@@ -578,10 +581,13 @@ function DetS(A::ZZMatrix, U::AbstractArray= -100:100; use_rns::Bool = false)
   while !det_small
     b = rand(matrix_space(ZZ,n,1),U); 
     if f
+      @show :solve_init
       D = dixon_init(A, b)
       f = false
     end
+    @show :solving
     @time TS, d = dixon_solve(D, b)
+    @assert D*TS == d*b
     for i=1:length(T)
       TS = T[i]*TS
     end
@@ -614,7 +620,7 @@ function DetS(A::ZZMatrix, U::AbstractArray= -100:100; use_rns::Bool = false)
     @time T1 = hcol(TS, d)  
     push!(T, T1)
     @show :solve
-    @time AT = Strassrn.solve_triu(T1, AT)
+    @time AT = Strassen.solve_triu(T1, AT)
 #    @show nbits(prod_p), nbits(d1)
 #    @show nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p)))
     if nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p))) < small 
@@ -626,7 +632,7 @@ function DetS(A::ZZMatrix, U::AbstractArray= -100:100; use_rns::Bool = false)
       h = hnf_modular_eldiv(AT, d)
       d1 *= prod([h[i,i] for i=1:n])
     @show Had / nbits(d1)  
-      AT = Strassen.solve_triu(h, AT)
+      AT = Nemo.solve_triu_left(h, AT)
       if nbits(abs(mod_sym(invmod(d1, prod_p)*det_p, prod_p))) < small
         break
       end
@@ -635,11 +641,12 @@ function DetS(A::ZZMatrix, U::AbstractArray= -100:100; use_rns::Bool = false)
   det_p = invmod(d1, prod_p)*det_p
   @show det_p = mod_sym(det_p, prod_p)
   @assert nbits(abs(det_p)) < small
-  h = hnf_modular_eldiv(AT, det_p)
-  @show det(h) // det_p, det(h)
-  d1 *= det(h)
+  @show :hnf
+  @time h = hnf_modular_eldiv(AT, det_p)
+  @show t_det(h) // det_p, det(h)
+  d1 *= t_det(h)
 
-  AT = Strassen.solve_triu(h, AT)
+  @time AT = Nemo.solve_triu_left(h, AT)
     println("DOING UNICERTZ");
     @show uni_cost(d1)
     @show crt_cost(d1)