Added InvMixColumns

nindanaoto · nindanaoto · commit 866937d7628b · 2025-05-10T01:58:47.000+09:00
diff --git a/include/aes.hpp b/include/aes.hpp
@@ -1,5 +1,6 @@
 #pragma once
 #include <AES.h>
+#include <ranges>
 // Transciphering by AES
 //  Based on Hippogryph
 namespace TFHEpp {
@@ -123,6 +124,112 @@ void InvShiftRows(std::array<TLWE<P>, 128> &res)
     ShiftRow<P,3,aesNb-3>(res);
 }
 
+template <class P>
+void MixColumn(std::array<TLWE<P>, 32>& y_out, const std::array<TLWE<P>, 32>& x){
+    // Applies the MixColumn transformation to one 32-bit column as a straight-line circuit of 92
+    // two-input gates: 60 write an intermediate t[k] and 32 write an output y_out[k] (each index once).
+    // x holds the 32 input ciphertexts, y_out receives the 32 outputs; t0...t59 / y0...y31 follow the circuit's naming.
+    std::array<TLWE<P>, 60> t; // For t0...t59
+
+    // Every gate is a TLWEAdd. NOTE(review): the circuit is specified in XOR gates, so TLWEAdd must
+    // act as XOR here; the callers in this file add 1 << (digits - 2) to element [P::k * P::n] of
+    // each input before calling, presumably for that reason - confirm before reusing elsewhere.
+    // Presumably this is the 92-XOR circuit of https://eprint.iacr.org/2019/833 cited at MixColumns - TODO confirm.
+    // Keep the gate order, and keep x distinct from y_out: e.g. y_out[19] is written before the last reads of x[19].
+    TLWEAdd<P>(t[0], x[0], x[8]);
+    TLWEAdd<P>(t[1], x[16], x[24]);
+    TLWEAdd<P>(t[2], x[1], x[9]);
+    TLWEAdd<P>(t[3], x[17], x[25]);
+    TLWEAdd<P>(t[4], x[2], x[10]);
+    TLWEAdd<P>(t[5], x[18], x[26]);
+    TLWEAdd<P>(t[6], x[3], x[11]);
+    TLWEAdd<P>(t[7], x[19], x[27]);
+    TLWEAdd<P>(t[8], x[4], x[12]);
+    TLWEAdd<P>(t[9], x[20], x[28]);
+    TLWEAdd<P>(t[10], x[5], x[13]);
+    TLWEAdd<P>(t[11], x[21], x[29]);
+    TLWEAdd<P>(t[12], x[6], x[14]);
+    TLWEAdd<P>(t[13], x[22], x[30]);
+    TLWEAdd<P>(t[14], x[23], x[31]);
+    TLWEAdd<P>(t[15], x[7], x[15]);
+    TLWEAdd<P>(t[16], x[8], t[1]);
+    TLWEAdd<P>(y_out[0], t[15], t[16]);
+    TLWEAdd<P>(t[17], x[7], x[23]);
+    TLWEAdd<P>(t[18], x[24], t[0]);
+    TLWEAdd<P>(y_out[16], t[14], t[18]);
+    TLWEAdd<P>(t[19], t[1], y_out[16]);
+    TLWEAdd<P>(y_out[24], t[17], t[19]);
+    TLWEAdd<P>(t[20], x[27], t[14]);
+    TLWEAdd<P>(t[21], t[0], y_out[0]);
+    TLWEAdd<P>(y_out[8], t[17], t[21]);
+    TLWEAdd<P>(t[22], t[5], t[20]);
+    TLWEAdd<P>(y_out[19], t[6], t[22]);
+    TLWEAdd<P>(t[23], x[11], t[15]);
+    TLWEAdd<P>(t[24], t[7], t[23]);
+    TLWEAdd<P>(y_out[3], t[4], t[24]);
+    TLWEAdd<P>(t[25], x[2], x[18]);
+    TLWEAdd<P>(t[26], t[17], t[25]);
+    TLWEAdd<P>(t[27], t[9], t[23]);
+    TLWEAdd<P>(t[28], t[8], t[20]);
+    TLWEAdd<P>(t[29], x[10], t[2]);
+    TLWEAdd<P>(y_out[2], t[5], t[29]);
+    TLWEAdd<P>(t[30], x[26], t[3]);
+    TLWEAdd<P>(y_out[18], t[4], t[30]);
+    TLWEAdd<P>(t[31], x[9], x[25]);
+    TLWEAdd<P>(t[32], t[25], t[31]);
+    TLWEAdd<P>(y_out[10], t[30], t[32]);
+    TLWEAdd<P>(y_out[26], t[29], t[32]);
+    TLWEAdd<P>(t[33], x[1], t[18]);
+    TLWEAdd<P>(t[34], x[30], t[11]);
+    TLWEAdd<P>(y_out[22], t[12], t[34]);
+    TLWEAdd<P>(t[35], x[14], t[13]);
+    TLWEAdd<P>(y_out[6], t[10], t[35]);
+    TLWEAdd<P>(t[36], x[5], x[21]);
+    TLWEAdd<P>(t[37], x[30], t[17]);
+    TLWEAdd<P>(t[38], x[17], t[16]);
+    TLWEAdd<P>(t[39], x[13], t[8]);
+    TLWEAdd<P>(y_out[5], t[11], t[39]);
+    TLWEAdd<P>(t[40], x[12], t[36]);
+    TLWEAdd<P>(t[41], x[29], t[9]);
+    TLWEAdd<P>(y_out[21], t[10], t[41]);
+    TLWEAdd<P>(t[42], x[28], t[40]);
+    TLWEAdd<P>(y_out[13], t[41], t[42]);
+    TLWEAdd<P>(y_out[29], t[39], t[42]);
+    TLWEAdd<P>(t[43], x[15], t[12]);
+    TLWEAdd<P>(y_out[7], t[14], t[43]);
+    TLWEAdd<P>(t[44], x[14], t[37]);
+    TLWEAdd<P>(y_out[31], t[43], t[44]);
+    TLWEAdd<P>(t[45], x[31], t[13]);
+    TLWEAdd<P>(y_out[15], t[44], t[45]);
+    TLWEAdd<P>(y_out[23], t[15], t[45]);
+    TLWEAdd<P>(t[46], t[12], t[36]);
+    TLWEAdd<P>(y_out[14], y_out[6], t[46]);
+    TLWEAdd<P>(t[47], t[31], t[33]);
+    TLWEAdd<P>(y_out[17], t[19], t[47]);
+    TLWEAdd<P>(t[48], t[6], y_out[3]);
+    TLWEAdd<P>(y_out[11], t[26], t[48]);
+    TLWEAdd<P>(t[49], t[2], t[38]);
+    TLWEAdd<P>(y_out[25], y_out[24], t[49]);
+    TLWEAdd<P>(t[50], t[7], y_out[19]);
+    TLWEAdd<P>(y_out[27], t[26], t[50]);
+    TLWEAdd<P>(t[51], x[22], t[46]);
+    TLWEAdd<P>(y_out[30], t[11], t[51]);
+    TLWEAdd<P>(t[52], x[19], t[28]);
+    TLWEAdd<P>(y_out[20], x[28], t[52]);
+    TLWEAdd<P>(t[53], x[3], t[27]);
+    TLWEAdd<P>(y_out[4], x[12], t[53]);
+    TLWEAdd<P>(t[54], t[3], t[33]);
+    TLWEAdd<P>(y_out[9], y_out[8], t[54]);
+    TLWEAdd<P>(t[55], t[21], t[31]);
+    TLWEAdd<P>(y_out[1], t[38], t[55]);
+    TLWEAdd<P>(t[56], x[4], t[17]);
+    TLWEAdd<P>(t[57], x[19], t[56]);
+    TLWEAdd<P>(y_out[12], t[27], t[57]);
+    TLWEAdd<P>(t[58], x[3], t[28]);
+    TLWEAdd<P>(t[59], t[17], t[58]);
+    TLWEAdd<P>(y_out[28], x[20], t[59]);
+}
+
 // https://eprint.iacr.org/2019/833
 template <class P>
 void MixColumns(std::array<TLWE<P>, 128> &state) {
@@ -137,109 +244,64 @@ void MixColumns(std::array<TLWE<P>, 128> &state) {
                 x[i*8+j] = state[i*32 + col*8 + j];
         for (int i = 0; i < 32; ++i) x[i][P::k * P::n] += 1ULL << (std::numeric_limits<typename P::T>::digits - 2);
 
-        // Temporary variables for intermediate XOR results, based on the 92-gate circuit
-        // Naming corresponds to t0...t59 and y0...y31 as per the circuit.
-        // We will use 't' for all intermediate values and 'y_out' for the final 32 output bits.
-        std::array<TLWE<P>, 60> t; // For t0...t59
+        // Apply the MixColumn transformation
         std::array<TLWE<P>, 32> y_out; // For final output bits y0...y31
+        MixColumn<P>(y_out, x);
+
+        // Place the resulting 32-bit column (y_out) back into the state array
+        for (int i = 0; i < 4; ++i) 
+            for (int j = 0; j < 8; ++j)
+                state[i*32 + col*8 + j] = y_out[i*8+j];
+    }
+    for(int i = 0; i < 128; i++)
+        state[i][P::k*P::n] -= (1ULL << (std::numeric_limits<typename P::T>::digits - 2));
+}
+
+// In-place multiply of one encrypted byte by {04} in GF(2^8) mod x^8+x^4+x^3+x+1; index 0 is the LSB.
+template <class P>
+void xxtimes(std::array<TLWE<P>,8> &statebyte)
+{
+    std::array<TLWE<P>, 8> product;  // separate buffer: inputs are still read after outputs are written
+    product[0] = statebyte[6];       // output bits 0, 6 and 7 are pure wiring, no gate needed
+    product[6] = statebyte[4];
+    product[7] = statebyte[5];
+    TLWEAdd<P>(product[1], statebyte[6], statebyte[7]);  // TLWEAdd is the XOR gate (relies on the caller's encoding)
+    TLWEAdd<P>(product[2], statebyte[0], statebyte[7]);
+    TLWEAdd<P>(product[3], statebyte[1], statebyte[6]);
+    TLWEAdd<P>(product[4], statebyte[2], product[1]);    // reuses bit 6 ^ bit 7 held in product[1]
+    TLWEAdd<P>(product[5], statebyte[3], statebyte[7]);
+    statebyte = product;
+}
+
+// https://doi.org/10.1007/s13389-017-0176-3
+template <class P>
+void InvMixColumns(std::array<TLWE<P>, 128> &state) {
+    // The Inverse MixColumns operation is applied to each 32-bit column of the state.
+    // The AES state is 128 bits, so there are 4 such columns.
+
+    for (int col = 0; col < 4; ++col) {
+        // Extract the current 32-bit column into a working array (x0 to x31)
+        std::array<TLWE<P>, 32> x; // Input bits for the current column
+        for (int i = 0; i < 4; ++i) 
+            for (int j = 0; j < 8; ++j)
+                x[i*8+j] = state[i*32 + col*8 + j];
+        for (int i = 0; i < 32; ++i) x[i][P::k * P::n] += 1ULL << (std::numeric_limits<typename P::T>::digits - 2); // Shift to support XOR
 
-        // Implement the 92 XOR gates from Listing 1 in the provided PDF [cite: 28, 29]
-        // Note: The paper uses various symbols for XOR (e.g., ^, ~, -, ´). We interpret all as XOR.
-        // The indices for x will be 0-31.
-
-        // Implement the 92 XOR gates based on the user-provided circuit
-        TLWEAdd<P>(t[0], x[0], x[8]);
-        TLWEAdd<P>(t[1], x[16], x[24]);
-        TLWEAdd<P>(t[2], x[1], x[9]);
-        TLWEAdd<P>(t[3], x[17], x[25]);
-        TLWEAdd<P>(t[4], x[2], x[10]);
-        TLWEAdd<P>(t[5], x[18], x[26]);
-        TLWEAdd<P>(t[6], x[3], x[11]);
-        TLWEAdd<P>(t[7], x[19], x[27]);
-        TLWEAdd<P>(t[8], x[4], x[12]);
-        TLWEAdd<P>(t[9], x[20], x[28]);
-        TLWEAdd<P>(t[10], x[5], x[13]);
-        TLWEAdd<P>(t[11], x[21], x[29]);
-        TLWEAdd<P>(t[12], x[6], x[14]);
-        TLWEAdd<P>(t[13], x[22], x[30]);
-        TLWEAdd<P>(t[14], x[23], x[31]);
-        TLWEAdd<P>(t[15], x[7], x[15]);
-        TLWEAdd<P>(t[16], x[8], t[1]);
-        TLWEAdd<P>(y_out[0], t[15], t[16]);
-        TLWEAdd<P>(t[17], x[7], x[23]);
-        TLWEAdd<P>(t[18], x[24], t[0]);
-        TLWEAdd<P>(y_out[16], t[14], t[18]);
-        TLWEAdd<P>(t[19], t[1], y_out[16]);
-        TLWEAdd<P>(y_out[24], t[17], t[19]);
-        TLWEAdd<P>(t[20], x[27], t[14]);
-        TLWEAdd<P>(t[21], t[0], y_out[0]);
-        TLWEAdd<P>(y_out[8], t[17], t[21]);
-        TLWEAdd<P>(t[22], t[5], t[20]);
-        TLWEAdd<P>(y_out[19], t[6], t[22]);
-        TLWEAdd<P>(t[23], x[11], t[15]);
-        TLWEAdd<P>(t[24], t[7], t[23]);
-        TLWEAdd<P>(y_out[3], t[4], t[24]);
-        TLWEAdd<P>(t[25], x[2], x[18]);
-        TLWEAdd<P>(t[26], t[17], t[25]);
-        TLWEAdd<P>(t[27], t[9], t[23]);
-        TLWEAdd<P>(t[28], t[8], t[20]);
-        TLWEAdd<P>(t[29], x[10], t[2]);
-        TLWEAdd<P>(y_out[2], t[5], t[29]);
-        TLWEAdd<P>(t[30], x[26], t[3]);
-        TLWEAdd<P>(y_out[18], t[4], t[30]);
-        TLWEAdd<P>(t[31], x[9], x[25]);
-        TLWEAdd<P>(t[32], t[25], t[31]);
-        TLWEAdd<P>(y_out[10], t[30], t[32]);
-        TLWEAdd<P>(y_out[26], t[29], t[32]);
-        TLWEAdd<P>(t[33], x[1], t[18]);
-        TLWEAdd<P>(t[34], x[30], t[11]);
-        TLWEAdd<P>(y_out[22], t[12], t[34]);
-        TLWEAdd<P>(t[35], x[14], t[13]);
-        TLWEAdd<P>(y_out[6], t[10], t[35]);
-        TLWEAdd<P>(t[36], x[5], x[21]);
-        TLWEAdd<P>(t[37], x[30], t[17]);
-        TLWEAdd<P>(t[38], x[17], t[16]);
-        TLWEAdd<P>(t[39], x[13], t[8]);
-        TLWEAdd<P>(y_out[5], t[11], t[39]);
-        TLWEAdd<P>(t[40], x[12], t[36]);
-        TLWEAdd<P>(t[41], x[29], t[9]);
-        TLWEAdd<P>(y_out[21], t[10], t[41]);
-        TLWEAdd<P>(t[42], x[28], t[40]);
-        TLWEAdd<P>(y_out[13], t[41], t[42]);
-        TLWEAdd<P>(y_out[29], t[39], t[42]);
-        TLWEAdd<P>(t[43], x[15], t[12]);
-        TLWEAdd<P>(y_out[7], t[14], t[43]);
-        TLWEAdd<P>(t[44], x[14], t[37]);
-        TLWEAdd<P>(y_out[31], t[43], t[44]);
-        TLWEAdd<P>(t[45], x[31], t[13]);
-        TLWEAdd<P>(y_out[15], t[44], t[45]);
-        TLWEAdd<P>(y_out[23], t[15], t[45]);
-        TLWEAdd<P>(t[46], t[12], t[36]);
-        TLWEAdd<P>(y_out[14], y_out[6], t[46]);
-        TLWEAdd<P>(t[47], t[31], t[33]);
-        TLWEAdd<P>(y_out[17], t[19], t[47]);
-        TLWEAdd<P>(t[48], t[6], y_out[3]);
-        TLWEAdd<P>(y_out[11], t[26], t[48]);
-        TLWEAdd<P>(t[49], t[2], t[38]);
-        TLWEAdd<P>(y_out[25], y_out[24], t[49]);
-        TLWEAdd<P>(t[50], t[7], y_out[19]);
-        TLWEAdd<P>(y_out[27], t[26], t[50]);
-        TLWEAdd<P>(t[51], x[22], t[46]);
-        TLWEAdd<P>(y_out[30], t[11], t[51]);
-        TLWEAdd<P>(t[52], x[19], t[28]);
-        TLWEAdd<P>(y_out[20], x[28], t[52]);
-        TLWEAdd<P>(t[53], x[3], t[27]);
-        TLWEAdd<P>(y_out[4], x[12], t[53]);
-        TLWEAdd<P>(t[54], t[3], t[33]);
-        TLWEAdd<P>(y_out[9], y_out[8], t[54]);
-        TLWEAdd<P>(t[55], t[21], t[31]);
-        TLWEAdd<P>(y_out[1], t[38], t[55]);
-        TLWEAdd<P>(t[56], x[4], t[17]);
-        TLWEAdd<P>(t[57], x[19], t[56]);
-        TLWEAdd<P>(y_out[12], t[27], t[57]);
-        TLWEAdd<P>(t[58], x[3], t[28]);
-        TLWEAdd<P>(t[59], t[17], t[58]);
-        TLWEAdd<P>(y_out[28], x[20], t[59]);
+        // Apply the Inverse MixColumn transformation (Decomposed to fix and MixColumn)
+        for (int offset = 0; offset < 2; ++offset) {
+            std::array<TLWE<P>, 8> statebyte;
+            for(int i = 0; i < 8; i++)
+                TLWEAdd<P>(statebyte[i], x[offset*8+i], x[(2+offset)*8+i]);
+            xxtimes<P>(statebyte);
+            for(int i = 0; i < 8; i++)
+                TLWEAdd<P>(x[offset*8+i], x[offset*8+i], statebyte[i]);
+            for(int i = 0; i < 8; i++)
+                TLWEAdd<P>(x[(2+offset)*8+i], x[(2+offset)*8+i], statebyte[i]);
+        }
+
+        // Apply the MixColumn transformation
+        std::array<TLWE<P>, 32> y_out; // For final output bits y0...y31
+        MixColumn<P>(y_out, x);
 
         // Place the resulting 32-bit column (y_out) back into the state array
         for (int i = 0; i < 4; ++i) 
@@ -249,4 +311,5 @@ void MixColumns(std::array<TLWE<P>, 128> &state) {
     for(int i = 0; i < 128; i++)
         state[i][P::k*P::n] -= (1ULL << (std::numeric_limits<typename P::T>::digits - 2));
 }
+
 }  // namespace TFHEpp
diff --git a/test/invmixcolumns.cpp b/test/invmixcolumns.cpp
@@ -0,0 +1,83 @@
+#include <cassert>
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <tfhe++.hpp>
+
+// Plaintext reference used to check the encrypted transform: overwrites state with the product
+// of INV_CMDS and state. GF_MUL_TABLE / INV_CMDS are not declared here; they come from the includes.
+void InvMixColumns(unsigned char state[4][4]) {
+  // Value-initialised accumulator; replaces memset, which needs <cstring> (never included by this file).
+  unsigned char temp_state[4][4] = {};
+
+  // temp_state[i][j] = XOR over k of GF_MUL_TABLE[INV_CMDS[i][k]][state[k][j]]
+  for (size_t i = 0; i < 4; ++i) {
+    for (size_t k = 0; k < 4; ++k) {
+      for (size_t j = 0; j < 4; ++j) {
+        temp_state[i][j] ^= GF_MUL_TABLE[INV_CMDS[i][k]][state[k][j]];
+      }
+    }
+  }
+
+  // Element-wise copy back; replaces memcpy for the same <cstring> reason.
+  for (size_t i = 0; i < 4; ++i)
+    for (size_t j = 0; j < 4; ++j) state[i][j] = temp_state[i][j];
+}
+
+// Encrypts num_test random 128-bit states bit by bit, applies TFHEpp::InvMixColumns to each,
+// and checks the decrypted result against the plaintext reference. Returns 1 on a mismatch.
+int main()
+{
+    using P = TFHEpp::lvl2param;
+    constexpr auto mu = 1ULL << (std::numeric_limits<typename P::T>::digits - 2);  // bit b is encrypted as b ? +mu : -mu
+    constexpr int num_test = 1000;  // int (was the non-standard uint): matches the int loop counters
+    std::random_device seed_gen;
+    std::default_random_engine engine(seed_gen());
+    std::uniform_int_distribution<uint32_t> binary(0, 1);
+    std::unique_ptr<TFHEpp::SecretKey> sk(new TFHEpp::SecretKey());
+    std::vector<std::array<TFHEpp::TLWE<P>,128>> cstate(num_test);
+    std::vector<std::array<uint8_t, 128>> plaintext(num_test);
+    for (int i = 0; i < num_test; i++)
+        for (int j = 0; j < 128; j++) {
+            plaintext[i][j] = binary(engine);
+            cstate[i][j] = TFHEpp::tlweSymEncrypt<P>(plaintext[i][j] ? mu : -mu, *sk);
+        }
+
+    // steady_clock is monotonic, so wall-clock adjustments cannot distort the measured interval.
+    const auto start = std::chrono::steady_clock::now();
+    for (int test = 0; test < num_test; test++)
+        TFHEpp::InvMixColumns<P>(cstate[test]);
+    const auto end = std::chrono::steady_clock::now();
+    const double elapsed =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+    std::cout << elapsed / num_test << "ms" << std::endl;
+
+    for (int i = 0; i < num_test; i++) {
+        std::array<uint8_t, 128> pres;
+        for (int j = 0; j < 128; j++)
+            pres[j] = TFHEpp::tlweSymDecrypt<P>(cstate[i][j], *sk);
+        // Bit l of byte [j][k] lives at index j*32 + k*8 + l, least significant bit first.
+        unsigned char state[4][4];
+        for (int j = 0; j < 4; j++)
+            for (int k = 0; k < 4; k++){
+                uint8_t byte = 0;
+                for (int l = 0; l < 8; l++)
+                    byte |= plaintext[i][j*32 + k * 8 + l] << l;
+                state[j][k] = byte;
+            }
+        InvMixColumns(state);
+        for (int j = 0; j < 4; j++)
+            for (int k = 0; k < 4; k++){
+                uint8_t byte = 0;
+                for (int l = 0; l < 8; l++)
+                    byte |= pres[j*32 + k * 8 + l] << l;
+                // Explicit check instead of assert(): assert is compiled out when NDEBUG is defined.
+                if (state[j][k] != byte) {
+                    std::cerr << "FAIL: test " << i << ", byte [" << j << "][" << k << "]" << std::endl;
+                    return 1;
+                }
+            }
+    }
+    std::cout << "PASS" << std::endl;
+}
diff --git a/test/mixcolumns.cpp b/test/mixcolumns.cpp
@@ -82,4 +82,5 @@ int main()
                 assert(state[j][k] == byte);
             }
     }
+  std::cout << "PASS" << std::endl;
 }

Original file line number	Diff line number	Diff line change
`@@ -82,4 +82,5 @@ int main()`
`82`	`82`	`assert(state[j][k] == byte);`
`83`	`83`	`}`
`84`	`84`	`}`
	`85`	`+ std::cout << "PASS" << std::endl;`
`85`	`86`	`}`