diff --git a/optim_mixture.jl b/optim_mixture.jl
index 8b74cf4..2fca757 100644
--- a/optim_mixture.jl
+++ b/optim_mixture.jl
@@ -1,40 +1,38 @@
 module OptimMixture
+import GLMakie as GLM
 import DelimitedFiles as DF
 import BenchmarkTools as BT
 import UnicodePlots as UP
-import LinearAlgebra: ⋅
+import NLopt
+import LinearAlgebra: norm, mul!, axpy!, axpby!
+
+import Logging
+Logging.global_logger(Logging.ConsoleLogger(stderr, Logging.Info))
 
 const ENERGIES = [2803, 2811, 2819, 2826, 2834, 2842, 2850, 2858, 2866, 2874, 2882, 2890, 2897, 2905, 2913, 2921, 2929, 2937, 2945, 2953, 2961, 2969, 2977, 2985, 2993, 3001, 3009, 3018, 3026, 3034, 3042, 3050]
 
-@kwdef struct Slab
+@kwdef struct Slab{T<:Real}
   energy::Int
-  data::Matrix{Int}
+  data::Matrix{T}
 end
 
-function load_slabs(width::Int=512, height::Int=512)
-  slabs = Slab[]()
+function load_slabs(; T::Type=Int, width::Int=512, height::Int=512)
+  slabs = Slab[]
   i_min, i_max = typemax(Int), 0
   for e in ENERGIES
-    slab = Slab(energy=e, data=DF.readdlm("test_sample/HeLa_F-SRS_512x512_2803cm-1.txt", ',', Int))
+    slab::Slab{T} = Slab{T}(energy=e, data=DF.readdlm("test_sample/HeLa_F-SRS_512x512_$(e)cm-1.txt", ',', T))
     push!(slabs, slab)
     i_min, i_max = min(i_min, minimum(slab.data)), max(i_max, maximum(slab.data))
-    return slabs, i_min, i_max
   end
-end
-
-function Q_matrix_transpose(X::Matrix{Float64}, Y::Matrix{Float64}, N::Int)
-  return sum(abs2, Y - view(X, 1:N, :) * view(X, (N+1):size(X, 1), :)')
-end
-
-function Q_matrix_reshape(X::Matrix{Float64}, Y::Matrix{Float64}, N::Int)
-  N_plus_M, m = size(X)
-  return sum(abs2, Y - view(X, 1:N, :) * reshape(view(X, (N+1):N_plus_M, :), m, N_plus_M - N))
+  s = join(size(slabs[1].data), "x")
+  @info "Loaded $(length(slabs)) $s slabs"
+  return slabs, i_min, i_max
 end
 
 function E_loop(X::Matrix{Float64}, Y::Matrix{Float64})
@@ -53,8 +51,49 @@ function E_loop(X::Matrix{Float64}, Y::Matrix{Float64})
   return s
 end
 
+function E!(X::Matrix{Float64}, Y::Matrix{Float64}, tmp::Matrix{Float64})
+  N, M = size(Y)
+  Q!(tmp, X, N)
+  # roughly 6ms with N = 512*512, M = 32, m = 2
+  # slightly faster than tmp .= tmp .- Y (7ms)
+  # slightly faster than tmp .-= Y (7ms)
+  axpy!(-1.0, Y, tmp)
+  # roughly 2.5ms
+  # slightly faster than norm2(tmp)
+  s = sum(x -> x^2, tmp)
+  return s
+end
+
+function test_E(N::Int, M::Int, m::Int; benchmark::Bool=true)
+  X = rand(N + M, m)
+  Y = rand(N, M)
+
+  tmp = zeros(N, M)
+
+  E_from_loop = E_loop(X, Y)
+  E_from_mul = E!(X, Y, tmp)
+
+  @show abs(E_from_mul - E_from_loop)
+
+  if benchmark
+    @info "Running benchmarks, this may take a while..."
+    show(stdout, "text/plain", BT.@benchmark E_loop($X, $Y))
+    println()
+    show(stdout, "text/plain", BT.@benchmark E!($X, $Y, $tmp))
+  end
+end
+
+
+function Q!(dst::Matrix{Float64}, X::Matrix{Float64}, N::Int)
+  N_plus_M, _ = size(X)
+  # roughly 9ms with N = 512*512, M = 32, m = 2
+  # could be sped up using StrideArray?
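+  # mul! fills dst with view(X, 1:N, :) * view(X, N+1:end, :)', i.e. the model
+  # image Q(X) = λ * sᵀ, where λ = X[1:N, :] holds one mixture weight per pixel
+  # and component and s = X[N+1:end, :] holds one spectral value per energy and
+  # component (naming as in plot_result_mixture_1 below).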
+  mul!(dst, view(X, 1:N, :), view(X, N+1:N_plus_M, :)')
+end
+
 function Q_loop!(dst::Matrix{Float64}, X::Matrix{Float64}, N::Int)
   N_plus_M, m = size(X)
+  # roughly 23.5ms with N = 512*512, M = 32, m = 2
   for i in 1:N
     for j in N+1:N_plus_M
       x = 0
@@ -66,58 +105,43 @@ function Q_loop!(dst::Matrix{Float64}, X::Matrix{Float64}, N::Int)
   end
 end
 
+function test_Q(N::Int, M::Int, m::Int; benchmark::Bool=true)
+  X = rand(N + M, m)
+
+  Q_from_loop = zeros(N, M)
+  Q_from_mul = zeros(N, M)
+
+  Q!(Q_from_mul, X, N)
+  Q_loop!(Q_from_loop, X, N)
+
+  @show norm(Q_from_mul - Q_from_loop)
+
+  if benchmark
+    @info "Running benchmarks, this may take a while..."
+    show(stdout, "text/plain", BT.@benchmark Q!($Q_from_mul, $X, $N))
+    println()
+    show(stdout, "text/plain", BT.@benchmark Q_loop!($Q_from_loop, $X, $N))
+  end
+end
+
 function DE_loop!(dst::Matrix{Float64}, X::Matrix{Float64}, Y::Matrix{Float64})
   dst_W = zeros(size(Y))
-  DE_loop!(dst, dst_W, X, Y)
+  DE_loop!!(dst, dst_W, X, Y)
 end
 
-function dot_prod_dual_loop(W::Matrix{Float64}, X::Matrix{Float64}, V::Matrix{Float64})
-  _, m = size(X)
-  N, M = size(W)
-
-  s = 0
-
-  for k in 1:m
-    for i in 1:N
-      x = 0
-      for j in 1:M
-        @inbounds x += W[i, j] * X[N+j, k]
-      end
-      s += x * V[i, k]
-    end
-    for j in 1:M
-      x = 0
-      for i in 1:N
-        @inbounds x += W[i, j] * X[i, k]
-      end
-      s += x * V[N+j, k]
-    end
-  end
-
-  return s
+function DE!!(dst::Matrix{Float64}, dst_W::Matrix{Float64}, X::Matrix{Float64}, Y::Matrix{Float64})
+  N_plus_M, m = size(X)
+  N, M = size(Y)
+  # dst is the same size as X
+  # we need storage of size Y
+  Q!(dst_W, X, N)
+  # compute W = 2(Q(X) - Y)
+  axpby!(-2.0, Y, 2.0, dst_W)
+  mul!(view(dst, 1:N, :), dst_W, view(X, N+1:N_plus_M, :))
+  mul!(view(dst, N+1:N_plus_M, :), dst_W', view(X, 1:N, :))
 end
 
-function dot_prod_primal_loop(W::Matrix{Float64}, X::Matrix{Float64}, V::Matrix{Float64})
-  _, m = size(X)
-  N, M = size(W)
-
-  s = 0
-
-  for i in 1:N
-    for j in 1:M
-      x = 0
-      for k in 1:m
-        @inbounds x += X[i, k] * V[N+j, k] + X[N+j, k] * V[i, k]
-
-      end
-      s += x * W[i, j]
-    end
-
-  return s
-end
-
-function DE_loop!(dst::Matrix{Float64}, dst_W::Matrix{Float64}, X::Matrix{Float64}, Y::Matrix{Float64})
+function DE_loop!!(dst::Matrix{Float64}, dst_W::Matrix{Float64}, X::Matrix{Float64}, Y::Matrix{Float64})
   _, m = size(X)
   N, M = size(Y)
   # dst is the same size as X
@@ -144,116 +168,162 @@ function DE_loop!(dst::Matrix{Float64}, dst_W::Matrix{Float6
   dst .*= 2
 end
 
-function DQ_V_loop!(dst::Matrix{Float64}, X::Matrix{Float64}, V::Matrix{Float64}, N::Int)
-  N_plus_M, m = size(X)
-  for i in 1:N
-    for j in N+1:N_plus_M
-      x = 0
-      for k in 1:m
-        @inbounds x += (V[i, k] * X[j, k] + X[i, k] * V[j, k])
-      end
-      @inbounds dst[i, j-N] = x
-    end
-  end
-end
-
-function Q_loop_transposed(X::Matrix{Float64}, Y::Matrix{Float64}, N::Int)
-  s = 0
-  m, N_plus_M = size(X)
-  for i in 1:N
-    for j in N+1:N_plus_M
-      x = 0
-      for k in 1:m
-        @inbounds x += X[k, i] * X[k, j]
-      end
-      @inbounds s += (Y[j-N, i] - x)^2
-    end
-  end
-  return s
-end
-
-function test_dot_prod(N::Int, M::Int, m::Int)
-  X = rand(N + M, m)
-  W = rand(N, M)
-  V = rand(N + M, m) .- 0.5
-
-  @show dot_prod_primal_loop(W, X, V)
-  @show dot_prod_dual_loop(W, X, V)
-end
-
-function test_DE(N::Int, M::Int, m::Int)
+function check_derivative_E(N::Int, M::Int, m::Int; loop::Bool=false)
   X = rand(N + M, m)
   Y = rand(N, M)
   V = rand(N + M, m) .- 0.5
 
   EX = E_loop(X, Y)
   grad_EX = zeros(size(X))
-  DE_loop!(grad_EX, X, Y)
+  tmp = zeros(size(Y))
+  if loop
+    DE_loop!!(grad_EX, tmp, X, Y)
+  else
+    DE!!(grad_EX, tmp,
+         X, Y)
+  end
 
   eps = [10.0^k for k in 1:-1:-12]
   errors = zeros(size(eps))
 
   for i in eachindex(eps)
-    EX_V_true = E_loop(X + eps[i] * V, Y)
+    if loop
+      EX_V_true = E_loop(X + eps[i] * V, Y)
+    else
+      EX_V_true = E!(X + eps[i] * V, Y, tmp)
+    end
     EX_V_approx = EX + eps[i] * sum(grad_EX .* V)
-    @show EX_V_true
-    @show EX_V_approx
+    # @show EX_V_true
+    # @show EX_V_approx
     errors[i] = abs(EX_V_true - EX_V_approx)
   end
 
-  @show eps
-  @show errors
+  # @show eps
+  # @show errors
   foo = UP.lineplot(eps, errors, xscale=:log10, yscale=:log10)
   UP.lineplot!(foo, eps, eps)
   println(foo)
 end
 
-function test_DQ(N::Int, M::Int, m::Int)
-  X = rand(N + M, m) * 255
-  V = rand(N + M, m)
+function test_DE(N::Int, M::Int, m::Int; benchmark::Bool=true)
+  X = rand(N + M, m)
+  Y = rand(N, M)
 
-  QX = zeros(N, M)
-  Q_loop!(QX, X, N)
+  tmp_W = zeros(N, M)
 
-  dst_true = zeros(N, M)
-  dst_approx = zeros(N, M)
+  ∇E_loop = zeros(N + M, m)
+  ∇E = zeros(N + M, m)
 
-  eps = [10.0^k for k in 1:-1:-8]
-  errors = zeros(size(eps))
+  DE_loop!!(∇E_loop, tmp_W, X, Y)
+  DE!!(∇E, tmp_W, X, Y)
 
-  for i in eachindex(eps)
-    Q_loop!(dst_true, X + eps[i] * V, N)
-    DQ_V_loop!(dst_approx, X, eps[i] * V, N)
-    dst_approx .+= QX
-    errors[i] = sum(abs, dst_approx - dst_true)
+  @show norm(∇E_loop - ∇E)
+
+  if benchmark
+    @info "Running benchmarks, this may take a while..."
+    show(stdout, "text/plain", BT.@benchmark DE_loop!!($∇E_loop, $tmp_W, $X, $Y))
+    println()
+    show(stdout, "text/plain", BT.@benchmark DE!!($∇E, $tmp_W, $X, $Y))
+  end
+end
+
+function benchmark()
+  slabs, _, _ = load_slabs(T=Float64)
+  Y::Matrix{Float64} = hcat([vec(slab.data) for slab in slabs]...)
+  @show size(Y)
+
+  N::Int, M::Int = size(Y)
+  m::Int = 2
+
+  X::Vector{Float64} = fill(0.5, (N + M) * m)
+  ∇E::Vector{Float64} = zeros((N + M) * m)
+  tmp = zeros(size(Y))
+
+  show(stdout, "text/plain", BT.@benchmark DE_loop!!(reshape($∇E, $N + $M, $m), $tmp, reshape($X, $N + $M, $m), reshape($Y, $N, $M)))
+  println()
+  show(stdout, "text/plain", BT.@benchmark E_loop(reshape($X, $N + $M, $m), reshape($Y, $N, $M)))
+end
+
+function plot_result_mixture_1(X::Vector{Float64}, N::Int, M::Int, m::Int)
+  X_ = reshape(X, N + M, m)
+  λ = view(X_, 1:N, :)
+  s = view(X_, N+1:N+M, :)
+
+  @show size(λ)
+  @show size(s)
+
+  fig = GLM.Figure()
+  axes = map(1:m+1) do i
+    aspect = if i < m + 1
+      GLM.DataAspect()
+    else
+      nothing
+    end
+    GLM.Axis(fig[1, i], aspect=aspect)
   end
-  @show eps
-  @show errors
-  println(UP.lineplot(eps, errors, xscale=:log10, yscale=:log10))
+  n = Int(sqrt(N))
+
+  # Component density
+  GLM.image!(axes[1], reshape(view(λ, :, 1), n, n))
+  GLM.image!(axes[2], reshape(view(λ, :, 2), n, n))
+
+  # Spectra
+  GLM.lines!(axes[3], s[:, 1])
+  GLM.lines!(axes[3], s[:, 2])
+
+  @show norm(λ[:, 1] - λ[:, 2])
+  @show norm(s[:, 1] - s[:, 2])
+
+  display(fig)
 end
 
-function test(N::Int, M::Int, m::Int)
-  Y = rand(N, M) * 255
-  X = rand(N + M, m) * 255
+function optim(algo::Symbol=:LD_MMA)
+  slabs, _, _ = load_slabs(T=Float64)
+  Y::Matrix{Float64} = hcat([vec(slab.data) for slab in slabs]...)
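+  # Y has one column per energy slab (the 32 ENERGIES) and one row per pixel of
+  # a 512x512 slab (N = 512*512 after vec), the layout E! and DE!! expect.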
+  @show size(Y)
 
-  X_pre = copy(X)
-  X_pre[N+1:N+M] .= vec(X_pre[N+1:N+M]')
+  N::Int, M::Int = size(Y)
+  m::Int = 2
 
-  r_matrix = Q_matrix_transpose(X, Y, N)
-  r_loop = Q_loop(X, Y, N)
+  X::Vector{Float64} = fill(0.5, (N + M) * m)
+  ∇E::Vector{Float64} = zeros((N + M) * m)
+  tmp = zeros(size(Y))
 
-  @show r_matrix, r_loop, abs(r_matrix - r_loop) / r_loop
+  function objective(X::Vector{Float64}, grad::Vector{Float64})
+    if length(grad) > 0
+      DE!!(reshape(grad, N + M, m), tmp, reshape(X, N + M, m), reshape(Y, N, M))
+    else
+      @debug "No gradient requested!"
+    end
+    # @show "|∇E| = $(norm(grad))"
+    max_Σ_λ = maximum(sum(view(reshape(X, N + M, m), 1:N, :); dims=2))
+    value = E!(reshape(X, N + M, m), reshape(Y, N, M), tmp)
+    @info "E = $value, |∇E| = $(norm(grad)), max_Σ_λ = $max_Σ_λ"
+    return value
+  end
+
+  opt = NLopt.Opt(algo, (N + M) * m)
+  NLopt.lower_bounds!(opt, fill(0.0, (N + M) * m))
+  NLopt.upper_bounds!(opt, vec([fill(1.0, N, m); fill(Inf, M, m)]))
+  NLopt.min_objective!(opt, objective)
+
+  res = NLopt.optimize(opt, X)
+
+  return res
 end
+
 end # module
 
-# OptimMixture.test(512 * 512, 20, 2)
-# OptimMixture.test_DQ(512 * 512, 20, 2)
-OptimMixture.test_DE(512 * 512, 20, 2)
-# OptimMixture.test_DE(20, 2, 2)
-# OptimMixture.test_dot_prod(512^2, 20, 2)
+res = OptimMixture.optim(:LD_LBFGS);
+OptimMixture.plot_result_mixture_1(res[2], 512 * 512, 32, 2)
+# OptimMixture.benchmark();
+# OptimMixture.test_E(512 * 512, 32, 2)
+# OptimMixture.test_DE(512 * 512, 32, 2)
+# OptimMixture.test_Q(512 * 512, 32, 2)
+# OptimMixture.check_derivative_E(512 * 512, 32, 2)
+# OptimMixture.check_derivative_E(512 * 512, 32, 2; loop=true)
 
 # vim: ts=2:sw=2:sts=2
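For reference, the objective implemented by E!/E_loop is the squared Frobenius norm of (λ * sᵀ - Y), with λ = X[1:N, :] and s = X[N+1:N+M, :]; DE!! fills the matching gradient blocks, 2(λsᵀ - Y) * s for λ and 2(λsᵀ - Y)' * λ for s. NLopt.optimize returns a (minf, minx, ret) tuple, which is why the driver passes res[2] (the minimizer) to plot_result_mixture_1. A sketch of an equivalent, more explicit driver (the destructured names are illustrative only):

    minf, minx, ret = OptimMixture.optim(:LD_LBFGS)
    @info "NLopt finished" ret minf
    OptimMixture.plot_result_mixture_1(minx, 512 * 512, 32, 2)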