diff --git a/experiments/mipmap/DiffCoopVec.slang b/experiments/mipmap/DiffCoopVec.slang
new file mode 100644
index 0000000..ec90a76
--- /dev/null
+++ b/experiments/mipmap/DiffCoopVec.slang
@@ -0,0 +1,272 @@
+typealias IReal = __BuiltinFloatingPointType;
+
+// This struct wraps a CoopVec to make it differentiable. This is a temporary
+// workaround until the Slang core library supplies a differentiable CoopVec.
+struct DiffCoopVec<T : IReal, let N : int> : IDifferentiable, IArray<T>, IArithmetic
+{
+    typealias Differential = DiffCoopVec<T, N>;
+
+    static const CoopVecComponentType ComponentType =
+        (T is half) ? CoopVecComponentType.Float16 :
+        (T is float) ? CoopVecComponentType.Float32 :
+        CoopVecComponentType.Float64;
+
+    CoopVec<T, N> cv;
+
+    [BackwardDifferentiable] __init() { this = fill(T(0.0f)); }
+    [BackwardDifferentiable] __init(T x) { this = fill(x); }
+    [BackwardDifferentiable] __init<S : IReal>(S x) { this = fill(x); }
+    [BackwardDifferentiable] __init(This x) { this = x; }
+    [BackwardDifferentiable] __init<S : IReal>(DiffCoopVec<S, N> x) { cv = CoopVec<T, N>(x.cv); }
+    __init(no_diff CoopVec<T, N> x) { cv = x; }
+
+    int getCount()
+    {
+        return N;
+    }
+
+    __subscript(int index) -> T
+    {
+        [BackwardDifferentiable] get { return indexRead(this, index); }
+        [BackwardDifferentiable] set { indexWrite(this, index, newValue); }
+    }
+
+    bool equals(This other) { return cv.equals(other.cv); }
+    bool lessThan(This other) { return cv.lessThan(other.cv); }
+    bool lessThanOrEquals(This other) { return cv.lessThanOrEquals(other.cv); }
+    [BackwardDifferentiable] This add(This other) { return add(this, other); }
+    [BackwardDifferentiable] This sub(This other) { return sub(this, other); }
+    [BackwardDifferentiable] This mul(This other) { return mul(this, other); }
+    [BackwardDifferentiable] This div(This other) { return div(this, other); }
+    [BackwardDifferentiable] This neg() { return neg(this); }
+    This mod(This other) { return This(cv.mod(other.cv)); }
+
+    [BackwardDifferentiable] T[N] toArray() { return toArray(this); }
+    [BackwardDifferentiable] vector<T, N> toVector() { return toVector(this); }
+
+    static Differential dzero() { return Differential(T(0.0f)); }
+    static Differential dadd(Differential a, Differential b) { return a + b; }
+    static Differential dmul<S : __BuiltinRealType>(S factor, Differential d) { return This(__realCast<T>(factor) * d.cv); }
+
+    [BackwardDerivative(fill_bwd)]
+    static This fill<S : IReal>(S x) { return This(CoopVec<T, N>(T(x.toFloat()))); }
+    [BackwardDerivative(cast_bwd)]
+    static This cast<S : IReal>(DiffCoopVec<S, N> x) { return This(CoopVec<T, N>(x.cv)); }
+    [BackwardDerivative(indexRead_bwd)]
+    static T indexRead(This x, int i) { return x.cv[i]; }
+    [BackwardDerivative(indexWrite_bwd)]
+    static void indexWrite(inout This x, int i, T value) { x.cv[i] = value; }
+    [BackwardDerivative(toArray_bwd)]
+    static T[N] toArray(This x)
+    {
+        T result[N];
+        for (int i = 0; i < N; ++i)
+            result[i] = x.cv[i];
+        return result;
+    }
+    [BackwardDerivative(toVector_bwd)]
+    static vector<T, N> toVector(This x)
+    {
+        vector<T, N> result;
+        for (int i = 0; i < N; ++i)
+            result[i] = x.cv[i];
+        return result;
+    }
+    [BackwardDerivative(fromArray_bwd)]
+    static This fromArray(T x[N])
+    {
+        CoopVec<T, N> cv;
+        for (int i = 0; i < N; ++i)
+            cv[i] = x[i];
+        return This(cv);
+    }
+    [BackwardDerivative(fromVector_bwd)]
+    static This fromVector(vector<T, N> x)
+    {
+        CoopVec<T, N> cv;
+        for (int i = 0; i < N; ++i)
+            cv[i] = x[i];
+        return This(cv);
+    }
+    [BackwardDerivative(add_bwd)] static This add(This a, This b) { return This(a.cv.add(b.cv)); }
+    [BackwardDerivative(sub_bwd)] static This sub(This a, This b) { return This(a.cv.sub(b.cv)); }
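+    // The remaining operations follow the same pattern: each primal op names
+    // its reverse-mode rule via [BackwardDerivative(...)], and the *_bwd
+    // companions below write the propagated gradient back through diffPair.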
+    [BackwardDerivative(mul_bwd)] static This mul(This a, This b) { return This(a.cv.mul(b.cv)); }
+    [BackwardDerivative(div_bwd)] static This div(This a, This b) { return This(a.cv.div(b.cv)); }
+    [BackwardDerivative(neg_bwd)] static This neg(This x) { return This(x.cv.neg()); }
+
+    static void fill_bwd<S : IReal>(inout DifferentialPair<S> x, Differential grad)
+    {
+        T dx = T(0.0f);
+        [ForceUnroll]
+        for (int i = 0; i < N; ++i)
+            dx += grad[i];
+
+        x = diffPair(x.p, __slang_noop_cast<DifferentialPair<S>.DifferentialElementType>(S(dx.toFloat())));
+    }
+    static void cast_bwd<S : IReal>(inout DifferentialPair<DiffCoopVec<S, N>> x, Differential grad)
+    {
+        x = diffPair(x.p, DiffCoopVec<S, N>(CoopVec<S, N>(grad.cv)));
+    }
+    static void indexRead_bwd(inout DifferentialPair<This> x, int i, T.Differential grad)
+    {
+        Differential d = dzero();
+        indexWrite(d, i, __slang_noop_cast<T>(grad));
+        x = diffPair(x.p, d);
+    }
+    static void indexWrite_bwd(inout DifferentialPair<This> x, int i, inout DifferentialPair<T> value)
+    {
+        let grad = __slang_noop_cast<T.Differential>(indexRead(x.d, i));
+        value = diffPair(value.p, grad);
+    }
+    static void toArray_bwd(inout DifferentialPair<This> x, T.Differential[N] grad)
+    {
+        Differential dx;
+        for (int i = 0; i < N; ++i)
+            dx.cv[i] = __slang_noop_cast<T>(grad[i]);
+        x = diffPair(x.p, dx);
+    }
+    static void toVector_bwd(inout DifferentialPair<This> x, vector<T, N> grad)
+    {
+        Differential dx;
+        for (int i = 0; i < N; ++i)
+            dx.cv[i] = grad[i];
+        x = diffPair(x.p, dx);
+    }
+    static void fromArray_bwd(inout DifferentialPair<T[N]> x, This grad)
+    {
+        T dx[N];
+        for (int i = 0; i < N; ++i)
+            dx[i] = grad.cv[i];
+        x = diffPair(x.p, __slang_noop_cast<DifferentialPair<T[N]>.DifferentialElementType>(dx));
+    }
+    static void fromVector_bwd(inout DifferentialPair<vector<T, N>> x, This grad)
+    {
+        vector<T, N> dx;
+        for (int i = 0; i < N; ++i)
+            dx[i] = grad.cv[i];
+        x = diffPair(x.p, __slang_noop_cast<DifferentialPair<vector<T, N>>.DifferentialElementType>(dx));
+    }
+    static void add_bwd(inout DifferentialPair<This> a, inout DifferentialPair<This> b, Differential grad)
+    {
+        a = diffPair(a.p, grad);
+        b = diffPair(b.p, grad);
+    }
+    static void sub_bwd(inout DifferentialPair<This> a, inout DifferentialPair<This> b, Differential grad)
+    {
+        a = diffPair(a.p, grad);
+        b = diffPair(b.p, -grad);
+    }
+    static void mul_bwd(inout DifferentialPair<This> a, inout DifferentialPair<This> b, Differential grad)
+    {
+        a = diffPair(a.p, b.p * grad);
+        b = diffPair(b.p, a.p * grad);
+    }
+    static void div_bwd(inout DifferentialPair<This> a, inout DifferentialPair<This> b, Differential grad)
+    {
+        a = diffPair(a.p, grad / b.p);
+        b = diffPair(b.p, (-a.p * grad) / (b.p * b.p));
+    }
+    static void neg_bwd(inout DifferentialPair<This> x, Differential grad)
+    {
+        x = diffPair(x.p, -grad);
+    }
+}
+
+__generic<T : IReal, S : IReal, let N : int>
+[BackwardDifferentiable] DiffCoopVec<S, N> operator +(DiffCoopVec<S, N> lhs, const T rhs) { return lhs + DiffCoopVec<S, N>(rhs); }
+__generic<T : IReal, S : IReal, let N : int>
+[BackwardDifferentiable] DiffCoopVec<S, N> operator -(DiffCoopVec<S, N> lhs, const T rhs) { return lhs - DiffCoopVec<S, N>(rhs); }
+__generic<T : IReal, S : IReal, let N : int>
+[BackwardDifferentiable] DiffCoopVec<S, N> operator /(DiffCoopVec<S, N> lhs, const T rhs) { return lhs / DiffCoopVec<S, N>(rhs); }
+__generic<T : IReal, S : IReal, let N : int>
+[BackwardDifferentiable] DiffCoopVec<S, N> operator +(const T lhs, DiffCoopVec<S, N> rhs) { return DiffCoopVec<S, N>(lhs) + rhs; }
+__generic<T : IReal, S : IReal, let N : int>
+[BackwardDifferentiable] DiffCoopVec<S, N> operator -(const T lhs, DiffCoopVec<S, N> rhs) { return DiffCoopVec<S, N>(lhs) - rhs; }
+__generic<T : IReal, S : IReal, let N : int>
+[BackwardDifferentiable] DiffCoopVec<S, N> operator /(const T lhs, DiffCoopVec<S, N> rhs) { return DiffCoopVec<S, N>(lhs) / rhs; }
+__generic<T : IReal, S : IReal, let N : int>
+[BackwardDerivative(scalarMultiplyR_bwd)] DiffCoopVec<S, N> operator *(DiffCoopVec<S, N> lhs, const T rhs) { return DiffCoopVec<S, N>(lhs.cv * S(rhs.toFloat())); }
+__generic<T : IReal, S : IReal, let N : int>
+[BackwardDerivative(scalarMultiplyL_bwd)] DiffCoopVec<S, N> operator *(const T lhs, DiffCoopVec<S, N> rhs) { return DiffCoopVec<S, N>(S(lhs.toFloat()) * rhs.cv); }
+__generic<T : IReal, S : IReal, let N : int>
+void scalarMultiplyR_bwd(inout DifferentialPair<DiffCoopVec<S, N>> lhs, inout DifferentialPair<T> rhs, DiffCoopVec<S, N> grad)
+{
+    lhs = diffPair(lhs.p, grad * rhs.p);
+    DiffCoopVec<S, N>::fill_bwd(rhs, grad * lhs.p);
+}
+__generic<T : IReal, S : IReal, let N : int>
+void scalarMultiplyL_bwd(inout DifferentialPair<T> lhs, inout DifferentialPair<DiffCoopVec<S, N>> rhs, DiffCoopVec<S, N> grad)
+{
+    scalarMultiplyR_bwd(rhs, lhs, grad);
+}
+
+__generic<T : IReal, let N : int>
+[BackwardDerivative(exp_bwd)]
+DiffCoopVec<T, N> exp(DiffCoopVec<T, N> x)
+{
+    return DiffCoopVec<T, N>(exp(x.cv));
+}
+__generic<T : IReal, let N : int>
+void exp_bwd(inout DifferentialPair<DiffCoopVec<T, N>> x, DiffCoopVec<T, N> grad)
+{
+    // d/dx exp(x) = exp(x)
+    x = diffPair(x.p, grad * exp(x.p));
+}
+
+__generic<T : IReal, let N : int>
+[BackwardDerivative(log_bwd)]
+DiffCoopVec<T, N> log(DiffCoopVec<T, N> x)
+{
+    return DiffCoopVec<T, N>(log(x.cv));
+}
+__generic<T : IReal, let N : int>
+void log_bwd(inout DifferentialPair<DiffCoopVec<T, N>> x, DiffCoopVec<T, N> grad)
+{
+    // d/dx log(x) = 1/x
+    x = diffPair(x.p, grad / x.p);
+}
+
+__generic<T : IReal, let N : int>
+[BackwardDerivative(tanh_bwd)]
+DiffCoopVec<T, N> tanh(DiffCoopVec<T, N> x)
+{
+    return DiffCoopVec<T, N>(tanh(x.cv));
+}
+__generic<T : IReal, let N : int>
+void tanh_bwd(inout DifferentialPair<DiffCoopVec<T, N>> x, DiffCoopVec<T, N> grad)
+{
+    // d/dx tanh(x) = 1 - tanh(x)^2
+    let y = tanh(x.p);
+    x = diffPair(x.p, (1.0f - y * y) * grad);
+}
+
+__generic<T : IReal, let N : int>
+[BackwardDerivative(atan_bwd)]
+DiffCoopVec<T, N> atan(DiffCoopVec<T, N> x)
+{
+    return DiffCoopVec<T, N>(atan(x.cv));
+}
+__generic<T : IReal, let N : int>
+void atan_bwd(inout DifferentialPair<DiffCoopVec<T, N>> x, DiffCoopVec<T, N> grad)
+{
+    // d/dx atan(x) = 1 / (x^2 + 1)
+    x = diffPair(x.p, grad / (x.p * x.p + 1.0f));
+}
+
+__generic<T : IReal, let N : int>
+[BackwardDerivative(max_bwd)]
+DiffCoopVec<T, N> max(DiffCoopVec<T, N> x, DiffCoopVec<T, N> y)
+{
+    return DiffCoopVec<T, N>(max(x.cv, y.cv));
+}
+__generic<T : IReal, let N : int>
+void max_bwd(inout DifferentialPair<DiffCoopVec<T, N>> x, inout DifferentialPair<DiffCoopVec<T, N>> y, DiffCoopVec<T, N> grad)
+{
+    // Route each lane's gradient to whichever input won the max.
+    DiffCoopVec<T, N> gradX, gradY;
+    [ForceUnroll]
+    for (int i = 0; i < N; ++i)
+    {
+        if (x.p[i] > y.p[i])
+            gradX[i] = grad[i];
+        else
+            gradY[i] = grad[i];
+    }
+    x = diffPair(x.p, gradX);
+    y = diffPair(y.p, gradY);
+}
+
+__generic<T : IReal, let N : int>
+[BackwardDerivative(min_bwd)]
+DiffCoopVec<T, N> min(DiffCoopVec<T, N> x, DiffCoopVec<T, N> y)
+{
+    return DiffCoopVec<T, N>(min(x.cv, y.cv));
+}
+__generic<T : IReal, let N : int>
+void min_bwd(inout DifferentialPair<DiffCoopVec<T, N>> x, inout DifferentialPair<DiffCoopVec<T, N>> y, DiffCoopVec<T, N> grad)
+{
+    // Route each lane's gradient to whichever input won the min.
+    DiffCoopVec<T, N> gradX, gradY;
+    [ForceUnroll]
+    for (int i = 0; i < N; ++i)
+    {
+        if (x.p[i] > y.p[i])
+            gradY[i] = grad[i];
+        else
+            gradX[i] = grad[i];
+    }
+    x = diffPair(x.p, gradX);
+    y = diffPair(y.p, gradY);
+}
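+
+// Minimal usage sketch (illustrative, not part of the wrapper's API surface):
+// a scalar function of a DiffCoopVec can be differentiated with bwd_diff just
+// like any other [Differentiable] Slang function.
+//
+// [Differentiable]
+// float sumOfSquares(DiffCoopVec<half, 4> v)
+// {
+//     let s = v * v;
+//     float total = 0.0f;
+//     [ForceUnroll]
+//     for (int i = 0; i < 4; ++i)
+//         total += float(s[i]);
+//     return total;
+// }
+//
+// var pair = diffPair(DiffCoopVec<half, 4>(1.0h));
+// bwd_diff(sumOfSquares)(pair, 1.0f);  // pair.d now holds d(total)/dv = 2*v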
diff --git a/experiments/mipmap/nsc_nw_02_3layers_coopvec.py b/experiments/mipmap/nsc_nw_02_3layers_coopvec.py
new file mode 100644
index 0000000..7adab36
--- /dev/null
+++ b/experiments/mipmap/nsc_nw_02_3layers_coopvec.py
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from app import App
+import slangpy as spy
+import numpy as np
+import json
+
+# Create the app and load the slang module.
+app = App(width=512*3+10*2, height=512, title="Mipmap Example", device_type=spy.DeviceType.vulkan)
+module = spy.Module.load_from_file(app.device, "nsc_nw_02_3layers_coopvec.slang")
+
+# Load the reference image.
+image = spy.Tensor.load_from_image(app.device,
+                                   "slangstars.png", linearize=False)
+
+np.random.seed(0)
+
+class NetworkParameters(spy.InstanceList):
+    def __init__(self, inputs: int, outputs: int):
+        super().__init__(module[f"NetworkParameters<{inputs},{outputs}>"])
+        self.inputs = inputs
+        self.outputs = outputs
+        self.layout = spy.CoopVecMatrixLayout.training_optimal
+
+        # Create initial values of biases and weights.
+        weights_np = np.random.uniform(-0.5, 0.5, (outputs, inputs)).astype(np.float16)
+        biases_np = np.zeros(outputs).astype(np.float16)
+
+        # Convert weights into the coopvec layout for training.
+        desc = app.device.coopvec_create_matrix_desc(outputs, inputs, self.layout, spy.DataType.float16, 0)
+        weight_count = desc.size // 2  # sizeof(half)
+        params_np = np.zeros((weight_count, ), dtype=np.float16)
+        app.device.coopvec_convert_matrix_host(weights_np, params_np, dst_layout=self.layout)
+
+        # Create bias and weight tensors.
+        self.biases = spy.Tensor.zeros(app.device, (outputs, ), dtype='half')
+        self.weights = spy.Tensor.zeros(app.device, (weight_count, ), dtype='half')
+        self.biases.copy_from_numpy(biases_np)
+        self.weights.copy_from_numpy(params_np)
+
+        # Gradients for the biases and weights.
+        self.biases_grad = spy.Tensor.zeros_like(self.biases)
+        self.weights_grad = spy.Tensor.zeros_like(self.weights)
+
+        # Moment buffers for the Adam optimizer.
+        self.m_biases = spy.Tensor.zeros(app.device, self.biases.shape, 'float')
+        self.m_weights = spy.Tensor.zeros(app.device, self.weights.shape, 'float')
+        self.v_biases = spy.Tensor.zeros_like(self.m_biases)
+        self.v_weights = spy.Tensor.zeros_like(self.m_weights)
+
+        self.set_data({
+            'biases': self.biases.storage,
+            'weights': self.weights.storage,
+            'biasGrads': self.biases_grad.storage,
+            'weightGrads': self.weights_grad.storage,
+            '_type': f"NetworkParameters<{inputs},{outputs}>"
+        })
+
+    # Calls the Slang 'optimize1' function for biases and weights.
+    def optimize(self, learning_rate: float, optimize_counter: int):
+        module.optimize1(self.biases, self.biases_grad, self.m_biases, self.v_biases, learning_rate, optimize_counter)
+        module.optimize1(self.weights, self.weights_grad, self.m_weights, self.v_weights, learning_rate, optimize_counter)
+
+    # Converts the opaque training layout back to row-major and returns a JSON-friendly dict.
+    def serialize(self):
+        params_np = self.weights.to_numpy()
+        weights_np = np.zeros((self.outputs, self.inputs), dtype=np.float16)
+        app.device.coopvec_convert_matrix_host(params_np, weights_np, src_layout=self.layout)
+
+        biases_np = self.biases.to_numpy()
+
+        return {
+            'num_inputs': self.inputs,
+            'num_outputs': self.outputs,
+            'weights': weights_np.flatten().tolist(),
+            'biases': biases_np.tolist()
+        }
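+
+# Note: the size of an opaque coopvec matrix layout may include padding, so the
+# weight buffer length must come from coopvec_create_matrix_desc rather than
+# outputs * inputs. A quick sanity check (illustrative only):
+#
+#   desc = app.device.coopvec_create_matrix_desc(
+#       32, 16, spy.CoopVecMatrixLayout.training_optimal, spy.DataType.float16, 0)
+#   assert desc.size >= 32 * 16 * 2  # at least the logical weights, in bytes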
+
+class Network(spy.InstanceList):
+    def __init__(self):
+        super().__init__(module["Network"])
+        self.layer0 = NetworkParameters(16,32)
+        self.layer1 = NetworkParameters(32,32)
+        self.layer2 = NetworkParameters(32,3)
+
+    # Calls 'optimize' on each layer.
+    def optimize(self, learning_rate: float, optimize_counter: int):
+        self.layer0.optimize(learning_rate, optimize_counter)
+        self.layer1.optimize(learning_rate, optimize_counter)
+        self.layer2.optimize(learning_rate, optimize_counter)
+
+    def serialize(self):
+        return {
+            'layers': [
+                self.layer0.serialize(),
+                self.layer1.serialize(),
+                self.layer2.serialize()
+            ]
+        }
+
+if spy.Feature.cooperative_vector not in module.device.features:
+    raise RuntimeError("Device does not support cooperative vector API")
+
+network = Network()
+
+optimize_counter = 0
+
+while app.process_events():
+
+    # Blit the reference image to the screen.
+    offset = 0
+    app.blit(image, size=spy.int2(512), offset=spy.int2(offset,0), tonemap=False, bilinear=True)
+    offset += 512 + 10
+    res = spy.int2(256,256)
+
+    # Evaluate the network for every output pixel.
+    lr_output = spy.Tensor.empty_like(image)
+    module.render(pixel = spy.call_id(),
+                  resolution = res,
+                  network = network,
+                  _result = lr_output)
+
+    # Blit the network output to the screen.
+    app.blit(lr_output, size=spy.int2(512, 512), offset=spy.int2(offset, 0), tonemap=False)
+    offset += 512 + 10
+
+    # Per-pixel squared error between the network output and the reference image.
+    loss_output = spy.Tensor.empty_like(image)
+    module.loss(pixel = spy.call_id(),
+                resolution = res,
+                network = network,
+                reference = image,
+                _result = loss_output)
+
+    # Blit the loss to the screen.
+    app.blit(loss_output, size=spy.int2(512, 512), offset=spy.int2(offset, 0), tonemap=False)
+    offset += 512 + 10
+
+    learning_rate = 0.001
+
+    for i in range(50):
+        # Back-propagate the loss gradients into the per-layer gradient buffers,
+        # then apply one Adam step per parameter.
+        module.calculate_grads(
+            seed = spy.wang_hash(seed=optimize_counter, warmup=2),
+            pixel = spy.call_id(),
+            resolution = res,
+            reference = image,
+            network = network)
+        optimize_counter += 1
+
+        network.optimize(learning_rate, optimize_counter)
+
+    print("Loss:", np.sum(np.abs(loss_output.to_numpy())))
+
+    # Present the window.
+    app.present()
+
+open('weights.json', 'w').write(json.dumps(network.serialize(), indent=4))
diff --git a/experiments/mipmap/nsc_nw_02_3layers_coopvec.slang b/experiments/mipmap/nsc_nw_02_3layers_coopvec.slang
new file mode 100644
index 0000000..7a9300f
--- /dev/null
+++ b/experiments/mipmap/nsc_nw_02_3layers_coopvec.slang
@@ -0,0 +1,152 @@
+import slangpy;
+import DiffCoopVec;
+
+#define PI 3.14159265358979323846f
+
+struct NetworkParameters<let Inputs : int, let Outputs : int>
+{
+    static const CoopVecComponentType ComponentType = CoopVecComponentType.Float16;
+
+    StructuredBuffer<half> weights, biases;
+    RWStructuredBuffer<half> weightGrads, biasGrads;
+
+    [BackwardDerivative(backward)]
+    DiffCoopVec<half, Outputs> forward(DiffCoopVec<half, Inputs> x)
+    {
+        return DiffCoopVec<half, Outputs>(coopVecMatMulAdd<half, Outputs>(
+            x.cv, ComponentType,
+            weights, 0, ComponentType,
+            biases, 0, ComponentType,
+            CoopVecMatrixLayout.TrainingOptimal, false, 0
+        ));
+    }
+
+    void backward(inout DifferentialPair<DiffCoopVec<half, Inputs>> x, DiffCoopVec<half, Outputs> grad)
+    {
+        // Accumulate dW += grad (outer product) x and db += grad into the gradient buffers.
+        coopVecOuterProductAccumulate(grad.cv, x.p.cv, weightGrads, 0, 0, CoopVecMatrixLayout.TrainingOptimal, ComponentType);
+        coopVecReduceSumAccumulate(grad.cv, biasGrads, 0);
+
+        // dX = W^T * grad (the transpose flag is true).
+        let dX = coopVecMatMul<half, Inputs>(grad.cv, ComponentType, weights, 0, ComponentType, CoopVecMatrixLayout.TrainingOptimal, true, 0);
+
+        x = diffPair(x.p, DiffCoopVec<half, Inputs>(dX));
+    }
+}
+
+
+struct Network {
+    NetworkParameters<16, 32> layer0;
+    NetworkParameters<32, 32> layer1;
+    NetworkParameters<32, 3> layer2;
+
+    [Differentiable]
+    float3 eval(no_diff float2 uv)
+    {
+        // Sine/cosine frequency encoding of the UV coordinate: 4 octaves x 4 values = 16 inputs.
+        DiffCoopVec<half, 16> inputs;
+        [ForceUnroll]
+        for (int i = 0; i < 4; ++i) {
+            float scale = float(2 << i);
+            inputs[i * 4 + 0] = half(sin(uv.x * PI * scale));
+            inputs[i * 4 + 1] = half(cos(uv.x * PI * scale));
+            inputs[i * 4 + 2] = half(sin(uv.y * PI * scale));
+            inputs[i * 4 + 3] = half(cos(uv.y * PI * scale));
+        }
+
+        var output0 = layer0.forward(inputs);
+        output0 = leakyReLU(output0);
+        var output1 = layer1.forward(output0);
+        output1 = leakyReLU(output1);
+        var output2 = layer2.forward(output1);
+        output2 = exp(output2);
+        return float3(output2.toVector());
+    }
+}
+
+[Differentiable]
+DiffCoopVec<half, N> activation<let N : int>(DiffCoopVec<half, N> x)
+{
+    return max(x, DiffCoopVec<half, N>(0.0h));
+}
+
+[Differentiable]
+DiffCoopVec<half, N> leakyReLU<let N : int>(DiffCoopVec<half, N> x)
+{
+    // max(x, 0) + 0.01 * min(x, 0) equals x for x >= 0 and 0.01*x otherwise.
+    return max(x, DiffCoopVec<half, N>(0.0h)) + min(x, DiffCoopVec<half, N>(0.0h)) * 0.01h;
+}
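+
+// Gradient sanity check (illustrative sketch): reverse-mode differentiation of
+// leakyReLU should yield slope 1 for positive lanes and 0.01 for negative ones.
+//
+// var input = DiffCoopVec<half, 2>(0.0h);
+// input[0] = 2.0h;   // positive lane -> expect gradient 1.0
+// input[1] = -2.0h;  // negative lane -> expect gradient 0.01
+// var pair = diffPair(input);
+// bwd_diff(leakyReLU<2>)(pair, DiffCoopVec<half, 2>(1.0h));
+// // pair.d[0] == 1.0h, pair.d[1] == 0.01h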
+
+// Render the network output at full resolution for the given pixel.
+[Differentiable]
+float3 render(int2 pixel, int2 resolution, Network network)
+{
+    float2 uv = (float2(pixel) + 0.5f) / float2(resolution);
+    return network.eval(uv);
+}
+
+[Differentiable]
+float3 loss(int2 pixel, int2 resolution, no_diff float3 reference, Network network)
+{
+    float3 color = render(pixel, resolution, network);
+    float3 error = color - reference;
+    return error * error; // Squared error
+}
+
+// Small linear congruential PRNG (not used by the kernels in this file).
+struct LCG
+{
+    uint state;
+
+    __init(uint seed) { state = seed; }
+
+    [mutating]
+    uint next_uint()
+    {
+        const uint A = 1664525u;
+        const uint C = 1013904223u;
+        state = (A * state + C);
+        return state;
+    }
+
+    [mutating]
+    float next_float()
+    {
+        // Convert to float in range [0, 1)
+        return (next_uint() >> 8) * 0x1p-24;
+    }
+};
+
+// One Adam step for a single half-precision parameter, with float moment state.
+void optimize1(inout half primalH, inout half gradH, inout float m_prev, inout float v_prev, float learning_rate, int iteration)
+{
+    // Standard Adam default values.
+    const float ADAM_BETA_1 = 0.9;
+    const float ADAM_BETA_2 = 0.999;
+    const float ADAM_EPSILON = 1e-8;
+
+    float primal = float(primalH);
+    float grad = float(gradH);
+
+    // Guard against non-finite gradients.
+    if (isnan(grad) || isinf(grad))
+        grad = 0.0f;
+
+    // Adam optimization.
+    float gradient2 = grad * grad;
+
+    float m = ADAM_BETA_1 * m_prev + (1.0 - ADAM_BETA_1) * grad;
+    float v = ADAM_BETA_2 * v_prev + (1.0 - ADAM_BETA_2) * gradient2;
+
+    m_prev = m;
+    v_prev = v;
+
+    // Bias-corrected moment estimates.
+    float mHat = m / (1.0f - pow(ADAM_BETA_1, iteration));
+    float vHat = v / (1.0f - pow(ADAM_BETA_2, iteration));
+
+    float update = learning_rate * (mHat / (sqrt(vHat) + ADAM_EPSILON));
+
+    // Apply the update to the parameter and reset the gradient.
+    primal -= update;
+
+    primalH = half(primal);
+    gradH = 0.0h;
+}
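+
+// For reference, optimize1 above implements the standard Adam recurrences,
+// with g = gradient, t = iteration, b1/b2 = the beta constants:
+//   m_t = b1*m_{t-1} + (1-b1)*g,   v_t = b2*v_{t-1} + (1-b2)*g^2
+//   mHat = m_t / (1 - b1^t),       vHat = v_t / (1 - b2^t)
+//   theta_t = theta_{t-1} - lr * mHat / (sqrt(vHat) + eps)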
+
+void calculate_grads(uint seed, int2 pixel, int2 resolution, float3 reference, Network network)
+{
+    // 'seed' is passed from the host but currently unused here.
+    bwd_diff(loss)(pixel, resolution, reference, network, 1.0f);
+}
diff --git a/experiments/mipmap/nsc_nw_inference_only.py b/experiments/mipmap/nsc_nw_inference_only.py
new file mode 100644
index 0000000..d97765e
--- /dev/null
+++ b/experiments/mipmap/nsc_nw_inference_only.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from app import App
+import slangpy as spy
+import numpy as np
+import json
+
+# Create the app and load the slang module.
+app = App(width=512*3+10*2, height=512, title="Mipmap Example", device_type=spy.DeviceType.vulkan)
+module = spy.Module.load_from_file(app.device, "nsc_nw_inference_only.slang")
+
+# Load the reference image.
+image = spy.Tensor.load_from_image(app.device,
+                                   "slangstars.png", linearize=False)
+
+class NetworkParameters(spy.InstanceList):
+    def __init__(self, data: dict):
+        inputs, outputs = data['num_inputs'], data['num_outputs']
+        super().__init__(module[f"NetworkParameters<{inputs},{outputs}>"])
+
+        self.inputs = inputs
+        self.outputs = outputs
+        self.layout = spy.CoopVecMatrixLayout.inferencing_optimal
+
+        # Debug: layer dimensions vs serialized weight count.
+        print(outputs, inputs, outputs * inputs, len(data['weights']))
+
+        # Load values of biases and weights.
+        weights_np = np.array(data['weights'], dtype=np.float16).reshape((outputs, inputs))
+        biases_np = np.array(data['biases'], dtype=np.float16)
+
+        # Convert weights into the coopvec layout for inference.
+        desc = app.device.coopvec_create_matrix_desc(self.outputs, self.inputs, self.layout, spy.DataType.float16, 0)
+        weight_count = desc.size // 2  # sizeof(half)
+        params_np = np.zeros((weight_count, ), dtype=np.float16)
+        app.device.coopvec_convert_matrix_host(weights_np, params_np, dst_layout=self.layout)
+
+        self.biases = app.device.create_buffer(struct_size=2, element_count=self.outputs, data=biases_np)
+        self.weights = app.device.create_buffer(struct_size=2, element_count=weight_count, data=params_np)
+
+class Network(spy.InstanceList):
+    def __init__(self, data: dict):
+        super().__init__(module["Network"])
+
+        assert len(data['layers']) == 3
+
+        self.layer0 = NetworkParameters(data['layers'][0])
+        self.layer1 = NetworkParameters(data['layers'][1])
+        self.layer2 = NetworkParameters(data['layers'][2])
+
+
+if spy.Feature.cooperative_vector not in module.device.features:
+    raise RuntimeError("Device does not support cooperative vector API")
+
+trained_weights = json.load(open('weights.json'))
+
+network = Network(trained_weights)
+
+while app.process_events():
+
+    # Blit the reference image to the screen.
+    offset = 0
+    app.blit(image, size=spy.int2(512), offset=spy.int2(offset,0), tonemap=False, bilinear=True)
+    offset += 512 + 10
+    res = spy.int2(256,256)
+
+    # Evaluate the network for every output pixel.
+    lr_output = spy.Tensor.empty_like(image)
+    module.render(pixel = spy.call_id(),
+                  resolution = res,
+                  network = network,
+                  _result = lr_output)
+
+    # Blit the network output to the screen.
+    app.blit(lr_output, size=spy.int2(512, 512), offset=spy.int2(offset, 0), tonemap=False)
+    offset += 512 + 10
+
+    # Per-pixel squared error between the network output and the reference image.
+    loss_output = spy.Tensor.empty_like(image)
+    module.loss(pixel = spy.call_id(),
+                resolution = res,
+                network = network,
+                reference = image,
+                _result = loss_output)
+
+    # Blit the loss to the screen.
+    app.blit(loss_output, size=spy.int2(512, 512), offset=spy.int2(offset, 0), tonemap=False)
+    offset += 512 + 10
+
+    # Present the window.
+    app.present()
diff --git a/experiments/mipmap/nsc_nw_inference_only.slang b/experiments/mipmap/nsc_nw_inference_only.slang
new file mode 100644
index 0000000..ba111a0
--- /dev/null
+++ b/experiments/mipmap/nsc_nw_inference_only.slang
@@ -0,0 +1,72 @@
+import slangpy;
+
+#define PI 3.14159265358979323846f
+
+struct NetworkParameters<let Inputs : int, let Outputs : int>
+{
+    static const CoopVecComponentType ComponentType = CoopVecComponentType.Float16;
+
+    StructuredBuffer<half> weights, biases;
+
+    CoopVec<half, Outputs> forward(CoopVec<half, Inputs> x)
+    {
+        return coopVecMatMulAdd<half, Outputs>(
+            x, ComponentType,
+            weights, 0, ComponentType,
+            biases, 0, ComponentType,
+            CoopVecMatrixLayout.InferencingOptimal, false, 0
+        );
+    }
+}
+
+
+struct Network {
+    NetworkParameters<16, 32> layer0;
+    NetworkParameters<32, 32> layer1;
+    NetworkParameters<32, 3> layer2;
+
+    float3 eval(float2 uv)
+    {
+        // Sine/cosine frequency encoding of the UV coordinate: 4 octaves x 4 values = 16 inputs.
+        CoopVec<half, 16> inputs;
+        [ForceUnroll]
+        for (int i = 0; i < 4; ++i) {
+            float scale = float(2 << i);
+            inputs[i * 4 + 0] = half(sin(uv.x * PI * scale));
+            inputs[i * 4 + 1] = half(cos(uv.x * PI * scale));
+            inputs[i * 4 + 2] = half(sin(uv.y * PI * scale));
+            inputs[i * 4 + 3] = half(cos(uv.y * PI * scale));
+        }
+
+        var output0 = layer0.forward(inputs);
+        output0 = leakyReLU(output0);
+        var output1 = layer1.forward(output0);
+        output1 = leakyReLU(output1);
+        var output2 = layer2.forward(output1);
+        output2 = exp(output2);
+        return float3(output2[0], output2[1], output2[2]);
+    }
+}
+
+CoopVec<half, N> activation<let N : int>(CoopVec<half, N> x)
+{
+    return max(x, CoopVec<half, N>(0.0h));
+}
+
+CoopVec<half, N> leakyReLU<let N : int>(CoopVec<half, N> x)
+{
+    return max(x, CoopVec<half, N>(0.0h)) + min(x, CoopVec<half, N>(0.0h)) * 0.01h;
+}
+
+// Render the network output at full resolution for the given pixel.
+float3 render(int2 pixel, int2 resolution, Network network)
+{
+    float2 uv = (float2(pixel) + 0.5f) / float2(resolution);
+    return network.eval(uv);
+}
+
+float3 loss(int2 pixel, int2 resolution, float3 reference, Network network)
+{
+    float3 color = render(pixel, resolution, network);
+    float3 error = color - reference;
+    return error * error; // Squared error
+}