NoiseResolution-HighDimData/Formatted code.Rmd at main · Muckthebuck/NoiseResolution-HighDimData · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
---
title: "R code for some plots and results"
output: html_notebook
---

# Code to obtain the plots and results

Please make sure you have run the code in the Functions section below before running the following blocks.

```{r}
# Draw the correlation heatmap of each dataset, pre-Covid first.
invisible(lapply(list(Data_Pre, Data_Post), corr_analysis))
```

```{r}
# IPR-versus-eigenvalue plots with eigenvectors flagged by the
# Kolmogorov-Smirnov test. The dataset is passed as a bare name because
# IPR_fun_KS() puts the argument expression into the plot title.
IPR_fun_KS(Data_Pre)
IPR_fun_KS(Data_Post)

# Same plots with eigenvectors flagged by the Jarque-Bera test.
IPR_fun_JB(Data_Pre)
IPR_fun_JB(Data_Post)
```

```{r}
# Eigenvalue histograms with the Marchenko-Pastur curve overlaid. The bare
# dataset name is passed because mp_plotting() uses it in the plot title.
mp_plotting(Data_Pre)
mp_plotting(Data_Post)
```

```{r}
# Compare the MP-law density with the two null models for each dataset. The
# bare dataset name is passed because MP_and_null() uses it in the plot title.
MP_and_null(Data_Pre)
MP_and_null(Data_Post)
```

```{r}
# Boxplots of top-entry turnover for a window of 100 and step widths of
# 20, 40, 60, 80 and 100: all pre-Covid plots first, then all post-Covid ones.
# local() keeps the loop variables out of the global environment.
invisible(local({
  for (dataset in list(Data_Pre, Data_Post)) {
    for (step_width in seq(20, 100, by = 20)) {
      temporal_corr_indexes(dataset, 100, step_width)
    }
  }
}))
```

```{r}
# Grid of eigenvector-overlap heatmaps for each dataset, pre-Covid first.
invisible(lapply(list(Data_Pre, Data_Post), plot_grid))
```

```{r}
# Pre-Covid: density of the cross-correlation coefficients before (green) and
# after (orange) regressing out the first-eigenvector component.
residuals_matrix <- linear_model_residuals(Data_Pre)

plot(density(cor(residuals_matrix)), main = "Distribution of correlation matrices of Data_Pre",
     col = "orange", xlab = "Cross-correlation coefficient", type = "l")
lines(density(cor(Data_Pre)), col = "green")
legend("topright", legend = c("First eigenvector removed", "Correlation matrix"),
       col = c("orange", "green"), lwd = 2, lty = c(1, 1))

# Post-Covid: same comparison. The reference curve must come from Data_Post,
# otherwise the residuals are compared against the wrong period.
residuals_matrix <- linear_model_residuals(Data_Post)

plot(density(cor(residuals_matrix)), main = "Distribution of correlation matrices of Data_Post",
     col = "orange", xlab = "Cross-correlation coefficient", type = "l")
lines(density(cor(Data_Post)), col = "green")
legend("topright", legend = c("First eigenvector removed", "Correlation matrix"),
       col = c("orange", "green"), lwd = 2, lty = c(1, 1))
```

```{r}
# Pre-Covid residuals (first-eigenvector component regressed out), followed by
# the dominant entries of the leading and of the trailing eigenvectors of their
# correlation matrix. Both results are shown through top-level auto-printing.
residuals_matrix <- linear_model_residuals(Data_Pre)
get_top_entries(residuals_matrix)
smallest_eigenvectors(residuals_matrix)
```

```{r}
# Post-Covid residuals (first-eigenvector component regressed out), followed by
# the dominant entries of the leading and of the trailing eigenvectors of their
# correlation matrix. Both results are shown through top-level auto-printing.
residuals_matrix <- linear_model_residuals(Data_Post)
get_top_entries(residuals_matrix)
smallest_eigenvectors(residuals_matrix)
```

```{r}
# Track the top entries of the first eigenvector across sliding windows and
# draw the resulting heatmap, first for the pre-Covid data and then for the
# post-Covid data. positions_matrix is overwritten by the second run.
positions_matrix <- temporal_corr_tracking(Data_Pre)
plot_tracking_heatmap(positions_matrix = positions_matrix)
positions_matrix <- temporal_corr_tracking(Data_Post)
plot_tracking_heatmap(positions_matrix = positions_matrix)
```

# Functions section

Necessary libraries

```{r}
# Load necessary libraries
library(readr)
library(stats)
library(ggplot2)
library(MASS)
```

The following code chunk loads the data.

```{r}

# Read each header-less CSV and transpose it straight away. t() turns the
# tibble into a plain matrix; the analysis functions below treat rows as
# observations and columns as variables.
Data_Pre <- t(read_csv("Datasets/Data_PreCovid_20170101_20200109.csv", col_names = FALSE))
Data_Post <- t(read_csv("Datasets/Data_PostCovid_20200110_20221231.csv", col_names = FALSE))
```

The following code chunk is a function used by many of the subsequent functions.

```{r}
# Draw a ggplot2 heatmap of a matrix whose values lie in [-1, 1].
#
# corr_matrix: numeric matrix (typically a correlation or overlap matrix).
#              Row and column names are ignored; tiles are labelled by their
#              integer row/column index.
# Returns the ggplot object; the caller is responsible for printing it.
plot_heatmap <- function(corr_matrix) {

  corr_matrix <- as.matrix(corr_matrix)
  n_rows <- nrow(corr_matrix)
  n_cols <- ncol(corr_matrix)

  # Long format built with base R (no dependency on an unattached melt()):
  # as.vector() walks the matrix column by column, so the row index cycles
  # fastest and the column index changes once per column.
  corr_long <- data.frame(
    Var1 = rep(seq_len(n_rows), times = n_cols),
    Var2 = rep(seq_len(n_cols), each = n_rows),
    value = as.vector(corr_matrix)
  )

  # Heatmap with a reversed y-axis so that index 1 sits at the top
  ggplot(data = corr_long, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() +
    # NA tiles (including values outside the [-1, 1] limits) are drawn in green
    scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                         midpoint = 0, limits = c(-1, 1), space = "Lab",
                         name = "Correlation", na.value = "green") +
    scale_y_reverse() +
    theme_minimal() +
    labs(title = "Heatmap of Correlation Matrix",
         x = "Variables",
         y = "Variables") +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 14),
          axis.text.y = element_text(size = 14),
          axis.title.x = element_text(size = 16),
          axis.title.y = element_text(size = 16))
}
```

Function to obtain the heatmap of the correlation matrix of a dataset.

```{r}
# Print the heatmap of the correlation matrix of Dataset (columns are the
# variables being correlated).
corr_analysis <- function(Dataset) {
  heatmap_plot <- plot_heatmap(corr_matrix = cor(Dataset))
  print(heatmap_plot)
}
```

Two functions to check Gaussianity of the eigenvectors via the Kolmogorov-Smirnov test and the Jarque-Bera test. Load them, as they are used by other functions.

```{r}
# Flag the eigenvectors of the correlation matrix of Dataset whose entries are
# rejected as Gaussian by a one-sample Kolmogorov-Smirnov test.
#
# Dataset: numeric matrix, columns are variables.
# alpha:   significance level; an eigenvector is flagged when p-value < alpha.
# Returns a list with
#   non_gaussian_vectors: list of the flagged eigenvectors,
#   positions:            their column indices in eigen()'s ordering
#                         (decreasing eigenvalue).
check_gaussianity <- function(Dataset, alpha = 0.05) {

  # Same matrix as JB_gaussianity() uses, so both tests index the same
  # eigenvectors.
  corr_matrix <- cor(Dataset)
  eigenvectors <- eigen(corr_matrix, symmetric = TRUE)$vectors

  # NOTE(review): mean and sd are estimated from the same sample that is
  # tested, so the nominal KS p-values are conservative.
  p_values <- vapply(seq_len(ncol(eigenvectors)), function(i) {
    eigenvector <- eigenvectors[, i]
    ks.test(eigenvector, "pnorm",
            mean = mean(eigenvector), sd = sd(eigenvector))$p.value
  }, numeric(1))

  indices <- which(p_values < alpha)
  non_gaussian <- lapply(indices, function(i) eigenvectors[, i])

  list(non_gaussian_vectors = non_gaussian, positions = indices)
}

# Flag the eigenvectors of the correlation matrix of Dataset whose entries are
# rejected as Gaussian by the Jarque-Bera test.
#
# Dataset: numeric matrix, columns are variables.
# alpha:   significance level; an eigenvector is flagged when p-value < alpha.
# Returns a list with
#   non_gaussian_vectors: list of the flagged eigenvectors,
#   positions:            their column indices in eigen()'s ordering
#                         (decreasing eigenvalue).
JB_gaussianity <- function(Dataset, alpha = 0.01) {

  # Jarque-Bera p-value computed with base R only, so the function does not
  # depend on a package the notebook never attaches. The statistic is
  # n * (skewness^2 / 6 + (kurtosis - 3)^2 / 24), referred to a chi-squared
  # distribution with 2 degrees of freedom.
  jb_p_value <- function(x) {
    n <- length(x)
    centred <- x - mean(x)
    m2 <- sum(centred^2) / n
    m3 <- sum(centred^3) / n
    m4 <- sum(centred^4) / n
    statistic <- n * (m3 / m2^(3 / 2))^2 / 6 + n * (m4 / m2^2 - 3)^2 / 24
    1 - pchisq(statistic, df = 2)
  }

  # Eigenvectors of the correlation matrix
  corr_matrix <- cor(Dataset)
  eigenvectors <- eigen(corr_matrix, symmetric = TRUE)$vectors

  p_values <- vapply(seq_len(ncol(eigenvectors)),
                     function(i) jb_p_value(eigenvectors[, i]),
                     numeric(1))

  # which() drops NA p-values (e.g. from a constant eigenvector)
  indices <- which(p_values < alpha)
  non_gaussian <- lapply(indices, function(i) eigenvectors[, i])

  list(non_gaussian_vectors = non_gaussian, positions = indices)
}
```

Two functions to obtain two null models: a Gaussian random matrix (GOE) and a shuffled version of the data. Load them, as they are used by other functions.

```{r}
# Return an m-by-n matrix filled with independent standard normal draws.
generate_GOE <- function(m, n) {
  draws <- rnorm(m * n)
  dim(draws) <- c(m, n)
  draws
}

# Shuffled null model: permute all entries of Dataset at random (across rows
# and columns alike) and return them in a matrix of the same dimensions.
randomize_null <- function(Dataset) {
  flat_values <- as.vector(Dataset)

  # One random permutation of every cell position, drawn without replacement
  permutation <- sample.int(length(flat_values))

  matrix(flat_values[permutation], nrow = nrow(Dataset), ncol = ncol(Dataset))
}
```

Function to calculate the IPR along with the Kolmogorov-Smirnov test

```{r}
# Plot log10(IPR) against log10(eigenvalue) for the correlation matrix of
# Dataset, colouring the points by the Kolmogorov-Smirnov Gaussianity verdict
# and shading the theoretical and the fitted Marchenko-Pastur ranges.
#
# Dataset:             numeric matrix, rows are observations, columns variables.
# lambda_plus_fitted,
# lambda_minus_fitted: upper / lower edge of the fitted MP range. When left
#                      NULL the stored fit is used: the Data_Pre values if
#                      Dataset is identical to the global Data_Pre, otherwise
#                      the Data_Post values.
# Called for its plot; requires check_gaussianity() and ggplot2's alpha().
IPR_fun_KS <- function(Dataset, lambda_plus_fitted = NULL, lambda_minus_fitted = NULL) {

  # Expression the caller passed, used in the plot title
  dataset_label <- deparse(substitute(Dataset))

  # Number of variables (N) and observations (M) for the MP-law parameters
  N <- ncol(Dataset)
  M <- nrow(Dataset)
  Q <- N / M

  # Theoretical range of the Marchenko-Pastur distribution
  lambda_min <- (1 - sqrt(Q))^2
  lambda_max <- (1 + sqrt(Q))^2

  # identical() yields a single TRUE/FALSE. Comparing the matrices with `==`
  # gives one value per cell (or fails on differing dimensions), which is not
  # a valid `if` condition.
  if (is.null(lambda_plus_fitted) || is.null(lambda_minus_fitted)) {
    is_pre <- identical(Dataset, Data_Pre)
    if (is.null(lambda_plus_fitted)) {
      lambda_plus_fitted <- if (is_pre) 1.029 else 0.872
    }
    if (is.null(lambda_minus_fitted)) {
      lambda_minus_fitted <- if (is_pre) 0.1138 else 0.083
    }
  }

  # One eigen decomposition supplies both eigenvalues and eigenvectors
  eigen_decomp <- eigen(cor(Dataset), symmetric = TRUE)
  values <- eigen_decomp$values

  # Inverse participation ratio: sum of the fourth powers of the entries
  IPRs <- colSums(eigen_decomp$vectors^4)

  # Positions of the eigenvectors flagged as non-Gaussian
  non_gaussian_positions <- check_gaussianity(Dataset)[["positions"]]

  data_plot <- data.frame(eigenvalues = values, IPR = IPRs,
                          gaussian_positions = rep("blue", times = N))
  data_plot$gaussian_positions[non_gaussian_positions] <- "green"

  # Plot the IPR against the eigenvalues
  plot(x = log10(data_plot$eigenvalues), y = log10(data_plot$IPR), type = "b",
       xlab = "log(Eigenvalues)", ylab = "log(Inverse Participation Ratio)", col = data_plot$gaussian_positions,
       main = paste("Plot of IPR against eigenvalues of ", dataset_label, "using Kolmogorov-Smirnov test"),
       xlim = c(-2, 1.5), ylim = c(-2, -0.2), cex.axis = 1.5, cex.lab = 1.5)

  # Theoretical MP range (grey rectangle, red edges)
  rect(log10(lambda_min), -2, log10(lambda_max), 0, col = rgb(0.5, 0.5, 0.5, 1/4))
  abline(v = log10(lambda_min), col = "red")
  abline(v = log10(lambda_max), col = "red")

  # Fitted MP range (pink rectangle, dashed pink edges)
  rect(log10(lambda_minus_fitted), -2, log10(lambda_plus_fitted), 0, col = alpha("pink", 0.5))
  abline(v = log10(lambda_plus_fitted), col = "pink", lty = 2, lwd = 2)
  abline(v = log10(lambda_minus_fitted), col = "pink", lty = 2, lwd = 2)

  # Redraw the connecting line in black on top of the shaded regions
  lines(log10(data_plot$eigenvalues), log10(data_plot$IPR), col = "black", lty = 1)

  legend("topright",
         legend = c("Associated to Non-Gaussian eigenvectors",
                    "Associated to Gaussian eigenvectors",
                    "MP-Law bounds",
                    "Fitted MP-Law bounds"),
         col = c("green", "blue", rgb(0.5, 0.5, 0.5, 1/4), alpha("pink", 0.5)),
         pch = c(16, 16, 15, 15),  # Filled squares stand for the rectangles
         pt.cex = 2,
         lty = c(NA, NA, NA, 2),
         lwd = c(NA, NA, NA, 2))
}
```

Function to calculate the IPR along with the Jarque-Bera hypothesis test

```{r}
# Plot log10(IPR) against log10(eigenvalue) for the correlation matrix of
# Dataset, colouring the points by the Jarque-Bera Gaussianity verdict and
# shading the fitted and the theoretical Marchenko-Pastur ranges. The plotted
# data frame is also printed.
#
# Dataset:             numeric matrix, rows are observations, columns variables.
# lambda_plus_fitted,
# lambda_minus_fitted: upper / lower edge of the fitted MP range. When left
#                      NULL the stored fit is used: the Data_Pre values if
#                      Dataset is identical to the global Data_Pre, otherwise
#                      the Data_Post values.
# Called for its plot; requires JB_gaussianity() and ggplot2's alpha().
IPR_fun_JB <- function(Dataset, lambda_plus_fitted = NULL, lambda_minus_fitted = NULL) {

  # Expression the caller passed, used in the plot title
  dataset_label <- deparse(substitute(Dataset))

  # Number of variables (N) and observations (M) for the MP-law parameters
  N <- ncol(Dataset)
  M <- nrow(Dataset)
  Q <- N / M

  # Theoretical range of the Marchenko-Pastur distribution
  lambda_min <- (1 - sqrt(Q))^2
  lambda_max <- (1 + sqrt(Q))^2

  # identical() yields a single TRUE/FALSE. Comparing the matrices with `==`
  # gives one value per cell (or fails on differing dimensions), which is not
  # a valid `if` condition.
  if (is.null(lambda_plus_fitted) || is.null(lambda_minus_fitted)) {
    is_pre <- identical(Dataset, Data_Pre)
    if (is.null(lambda_plus_fitted)) {
      lambda_plus_fitted <- if (is_pre) 1.029 else 0.872
    }
    if (is.null(lambda_minus_fitted)) {
      lambda_minus_fitted <- if (is_pre) 0.1138 else 0.083
    }
  }

  # Eigen decomposition of the correlation matrix
  eigen_decomp <- eigen(cor(Dataset))
  values <- eigen_decomp$values

  # Inverse participation ratio: sum of the fourth powers of the entries
  IPRs <- colSums(eigen_decomp$vectors^4)

  # Positions of the eigenvectors flagged as non-Gaussian
  non_gaussian_positions <- JB_gaussianity(Dataset = Dataset)[["positions"]]

  data_plot <- data.frame(eigenvalues = values, IPR = IPRs,
                          gaussian_positions = rep("blue", times = N))
  data_plot$gaussian_positions[non_gaussian_positions] <- "green"
  print(data_plot)

  # Plot the IPR against the eigenvalues on log scales
  plot(x = log10(data_plot$eigenvalues), y = log10(data_plot$IPR), type = "b",
       xlab = "log(Eigenvalues)", ylab = "log(Inverse Participation Ratio)", col = data_plot$gaussian_positions,
       main = paste("Plot of IPR against eigenvalues of", dataset_label, "using Jarque-Bera test"),
       xlim = c(-2, 1.5), ylim = c(-2, -0.2), cex.lab = 1.5, cex.axis = 1.5)

  # Fitted MP range (pink rectangle, dashed pink edges)
  rect(log10(lambda_minus_fitted), -2, log10(lambda_plus_fitted), 0, col = alpha("pink", 0.5))
  abline(v = log10(lambda_plus_fitted), col = "pink", lty = 2, lwd = 2)
  abline(v = log10(lambda_minus_fitted), col = "pink", lty = 2, lwd = 2)

  # Redraw the connecting line in black on top of the pink region
  lines(log10(data_plot$eigenvalues), log10(data_plot$IPR), col = "black", lty = 1)

  # Theoretical MP range (grey rectangle, red edges)
  rect(log10(lambda_min), -2, log10(lambda_max), 0, col = rgb(0.5, 0.5, 0.5, 1/4))
  abline(v = log10(lambda_min), col = "red")
  abline(v = log10(lambda_max), col = "red")

  legend("topright",
         legend = c("Associated to Non-Gaussian eigenvectors",
                    "Associated to Gaussian eigenvectors",
                    "MP-Law bounds",
                    "Fitted MP-Law bounds"),
         col = c("green", "blue", rgb(0.5, 0.5, 0.5, 1/4), alpha("pink", 0.5)),
         pch = c(16, 16, 15, 15),  # Filled squares stand for the rectangles
         pt.cex = 2,
         lty = c(NA, NA, NA, 2),
         lwd = c(NA, NA, NA, 2))
}
```

Function to calculate the MP-Law on a dataset

```{r}
# Marchenko-Pastur density for unit variance.
#
# lambda: numeric vector of points at which to evaluate the density.
# Q:      shape parameter, number of variables divided by number of
#         observations.
# Returns a numeric vector the same length as lambda. The density is 0 outside
# the support [(1 - sqrt(Q))^2, (1 + sqrt(Q))^2].
mp_density <- function(lambda, Q) {
  lambda_min <- (1 - sqrt(Q))^2
  lambda_max <- (1 + sqrt(Q))^2

  # Outside the support the product is negative; clamp it to 0 so the result
  # is a zero density instead of NaN plus a sqrt() warning.
  inside <- pmax((lambda_max - lambda) * (lambda - lambda_min), 0)

  sqrt(inside) / (2 * pi * Q * lambda)
}
```

This function produces the histogram of the eigenvalues with the MP-law density overplotted (the GOE overlay is currently commented out in the code).

```{r}
# Histogram of the eigenvalues of the correlation matrix of Dataset with the
# Marchenko-Pastur density drawn on top.
#
# Dataset: numeric matrix, rows are observations, columns are variables.
# Called for its plot. The x-axis is fixed to [0, 10], so larger eigenvalues
# fall outside the visible range.
mp_plotting <- function(Dataset) {

  # Expression the caller passed, used in the plot title
  dataset_label <- deparse(substitute(Dataset))

  # Eigenvalues of the correlation matrix
  eigenval <- eigen(cor(Dataset), symmetric = TRUE)$values

  hist(eigenval, breaks = 200, probability = TRUE,
       main = paste("Histogram of Eigenvalues with MP Law", dataset_label),
       xlab = "Eigenvalues", col = "lightblue", ylim = c(0, 1.5), xlim = c(0, 10), cex.axis = 1.5)

  # Number of variables (N) and observations (M) for the MP-law parameters
  N <- ncol(Dataset)
  M <- nrow(Dataset)
  Q <- N / M

  # Support of the Marchenko-Pastur distribution
  lambda_min <- (1 - sqrt(Q))^2
  lambda_max <- (1 + sqrt(Q))^2

  # mp_density() is vectorised over lambda, so it is called once on the grid
  lambda_seq <- seq(lambda_min, lambda_max, length.out = 1000)
  mp_values <- mp_density(lambda_seq, Q = Q)

  lines(lambda_seq, mp_values, col = "red", lwd = 2, lty = 2)

  # The overlay of a Gaussian random matrix spectrum is not drawn here;
  # MP_and_null() produces that comparison.

  # One line type per legend entry
  legend("topright", legend = c("Eigenvalues", "MP-Law"),
         col = c("lightblue", "red"), lwd = 2, lty = c(1, 2))
}
```

Function to compare the MP-law density with the eigenvalue densities of the shuffled data and of a Gaussian random matrix

```{r}
# Overlay three curves for Dataset's dimensions: the eigenvalue density of the
# correlation matrix of a Gaussian random matrix (black), the
# Marchenko-Pastur density (dashed red) and the eigenvalue density of the
# correlation matrix of the shuffled data (orange).
MP_and_null <- function(Dataset) {
  n_vars <- ncol(Dataset)
  n_obs <- nrow(Dataset)
  shape <- n_vars / n_obs

  # Gaussian null model first, shuffled null model second, so the random
  # number stream is consumed in that order
  gaussian_eigs <- as.vector(eigen(cor(generate_GOE(m = n_obs, n = n_vars)))$values)
  shuffled_eigs <- eigen(cor(randomize_null(Dataset)))$values

  # MP density evaluated on a grid covering its support
  support_lower <- (1 - sqrt(shape))^2
  support_upper <- (1 + sqrt(shape))^2
  lambda_grid <- seq(support_lower, support_upper, length.out = 1000)
  mp_curve <- sapply(lambda_grid, mp_density, Q = shape)

  # Kernel density of the Gaussian-matrix eigenvalues is the base curve
  gaussian_density <- density(gaussian_eigs)
  plot(y = gaussian_density$y, x = gaussian_density$x, type = "l",
       ylim = c(0, 1.2),
       main = paste("Comparison of histograms of", deparse(substitute(Dataset))),
       ylab = "density", xlab = "Eigenvalues", cex.lab = 1.7, cex.axis = 1.7)

  # MP law and shuffled-data density on top
  lines(lambda_grid, mp_curve, col = "red", lwd = 2, lty = 2)
  lines(density(shuffled_eigs), col = "orange", lwd = 2)

  legend("topright", legend = c("Control shuffle", "MP-Law", "GOE Entries"),
         col = c("orange", "red", "black"), lwd = 2, lty = c(1, 2, 1))
}
```

Function to track change in the top 10 entries.

```{r}
# Boxplot of the turnover of the dominant eigenvector entries between two
# windows that are step_width rows apart.
#
# For every eigenvalue of the full-sample correlation matrix that exceeds the
# Marchenko-Pastur upper edge, the eigenvector at the same position is
# computed in window A and in window B (= A shifted by step_width rows), and
# the fraction of the top_k largest-magnitude entries of A that are missing
# from the top_k of B is recorded. Windows advance by step_width rows per
# cycle.
#
# Dataset:     numeric matrix, rows are observations, columns are variables.
# window_size: each window spans window_size + 1 consecutive rows.
# step_width:  offset between windows A and B, and between successive cycles.
# top_k:       number of dominant entries compared (default 10).
# Returns, invisibly, the value of boxplot(); NULL with a warning when there
# is nothing to plot.
temporal_corr_indexes <- function(Dataset, window_size, step_width, top_k = 10) {

  # Number of variables (N) and observations (M) for the MP-law parameters
  N <- ncol(Dataset)
  M <- nrow(Dataset)
  Q <- N / M

  # Upper edge of the MP-law bulk
  lambda_max <- (1 + sqrt(Q))^2

  # Positions of the full-sample eigenvalues above the bulk
  eigenvalues <- eigen(cor(Dataset))$values
  positions <- which(eigenvalues > lambda_max)
  num_positions <- length(positions)

  # Largest number of cycles for which window B still ends inside the data:
  # end_B = i * step_width + window_size + 1 <= M
  cycles <- floor((M - (window_size + 1)) / step_width)

  if (num_positions == 0 || cycles < 1) {
    warning("No eigenvalue above the MP bound or too few rows for one cycle; nothing plotted",
            call. = FALSE)
    return(invisible(NULL))
  }

  top_k <- min(top_k, N)

  # Indices of the top_k largest-magnitude entries of each tracked eigenvector
  # in one window (list with one integer vector per tracked position)
  top_sets <- function(window) {
    vectors <- eigen(cor(window), symmetric = TRUE)$vectors[, positions, drop = FALSE]
    lapply(seq_len(num_positions), function(p) {
      order(abs(vectors[, p]), decreasing = TRUE)[seq_len(top_k)]
    })
  }

  change_counts <- matrix(NA_real_, nrow = cycles, ncol = num_positions)

  for (i in seq_len(cycles)) {
    # Advance by step_width per cycle; this is the stepping the cycle count
    # above is derived for (starting at row i would only cover the first few
    # rows of the data).
    start_A <- (i - 1) * step_width + 1
    start_B <- start_A + step_width

    top_A <- top_sets(Dataset[start_A:(start_A + window_size), , drop = FALSE])
    top_B <- top_sets(Dataset[start_B:(start_B + window_size), , drop = FALSE])

    # Number of top entries of A that are no longer among the top entries of B
    change_counts[i, ] <- vapply(seq_len(num_positions), function(p) {
      sum(!top_A[[p]] %in% top_B[[p]])
    }, integer(1))
  }

  # Express the counts as a fraction of the entries compared
  change_counts_df <- as.data.frame(change_counts / top_k)

  # One box per tracked eigenvector
  boxplot(change_counts_df,
          xlab = "Eigenvectors",
          ylab = "change",
          col = "lightblue",
          border = "darkblue",
          notch = FALSE,
          ylim = c(0, 1),
          cex.axis = 1.5,
          cex.lab = 1.5)
}
```

This function provides the heatmap of the eigenvectors to the right of the bulk

```{r}
# Heatmap of the average overlap between the above-bulk eigenvectors of two
# windows that are step_width rows apart.
#
# For every eigenvalue of the full-sample correlation matrix that exceeds the
# Marchenko-Pastur upper edge, the eigenvector at the same position is taken
# in window A and in window B (= A shifted by step_width rows). The matrix of
# inner products t(A) %*% B is averaged over all cycles and passed to
# plot_heatmap(). Windows advance by step_width rows per cycle.
#
# Dataset:     numeric matrix, rows are observations, columns are variables.
# window_size: each window spans window_size + 1 consecutive rows.
# step_width:  offset between windows A and B, and between successive cycles.
# Returns the ggplot object produced by plot_heatmap().
#
# NOTE(review): eigen() fixes eigenvectors only up to sign, so signed inner
# products from different cycles can partly cancel in the average; confirm
# whether absolute (or squared) overlaps were intended.
temporal_corr_cropped <- function(Dataset, window_size, step_width) {

  # Number of variables (N) and observations (M) for the MP-law parameters
  N <- ncol(Dataset)
  M <- nrow(Dataset)
  Q <- N / M

  # Upper edge of the MP-law bulk
  lambda_max <- (1 + sqrt(Q))^2

  # Positions of the full-sample eigenvalues above the bulk
  eigenvalues <- eigen(cor(Dataset))$values
  positions <- which(eigenvalues > lambda_max)
  num_positions <- length(positions)

  # Accumulator, labelled by the eigenvector positions
  temporal_average <- matrix(0, nrow = num_positions, ncol = num_positions)
  rownames(temporal_average) <- positions
  colnames(temporal_average) <- positions

  # Largest number of cycles for which window B still ends inside the data:
  # end_B = i * step_width + window_size + 1 <= M
  cycles <- floor((M - (window_size + 1)) / step_width)

  cycles_used <- 0
  for (i in seq_len(cycles)) {
    # Advance by step_width per cycle; this is the stepping the cycle count
    # above is derived for.
    start_A <- (i - 1) * step_width + 1
    end_A <- start_A + window_size
    start_B <- start_A + step_width
    end_B <- start_B + window_size

    if (end_B > M) break

    # Eigen decomposition of each window's correlation matrix
    eigen_A <- eigen(cor(Dataset[start_A:end_A, ]), symmetric = TRUE)
    eigen_B <- eigen(cor(Dataset[start_B:end_B, ]), symmetric = TRUE)

    # Keep only the tracked eigenvectors (drop = FALSE keeps matrix form)
    vectors_A <- eigen_A$vectors[, positions, drop = FALSE]
    vectors_B <- eigen_B$vectors[, positions, drop = FALSE]

    # Accumulate the matrix of inner products t(A) %*% B
    temporal_average <- temporal_average + crossprod(vectors_A, vectors_B)
    cycles_used <- cycles_used + 1
  }

  if (cycles_used == 0) {
    stop("Dataset has too few rows for a single pair of windows", call. = FALSE)
  }

  # Divide by the number of cycles actually accumulated
  temporal_average <- temporal_average / cycles_used

  plot_heatmap(temporal_average)
}

# Arrange the overlap heatmaps of temporal_corr_cropped() for a window of 100
# and step widths of 20, 40, 60, 80 and 100 in a two-column grid.
#
# Dataset: numeric matrix, rows are observations, columns are variables.
# Returns the value of gridExtra::grid.arrange().
plot_grid <- function(Dataset) {

  # One heatmap per step width
  step_widths <- seq(20, 100, by = 20)
  plot_list <- lapply(step_widths, function(step_width) {
    temporal_corr_cropped(Dataset, window_size = 100, step_width = step_width)
  })

  # Namespace-qualified because the notebook never attaches gridExtra
  gridExtra::grid.arrange(grobs = plot_list, ncol = 2)
}
```

Function to regress each standardized series on the first-eigenvector component and return the residuals

```{r}
# Remove the first-eigenvector component from every column of Dataset.
#
# The columns are standardized with scale(). The projection of each row onto
# the first eigenvector of the correlation matrix gives one explanatory
# series, and every standardized column is regressed on it (with intercept).
#
# Dataset: numeric matrix, rows are observations, columns are variables.
# Returns a matrix of the same dimensions holding the regression residuals,
# without dimnames.
linear_model_residuals <- function(Dataset) {

  N <- nrow(Dataset)  # observations
  M <- ncol(Dataset)  # variables

  scaled_data <- scale(Dataset)
  first_eigenvector <- eigen(cor(Dataset))$vectors[, 1]

  # Projection of every observation onto the first eigenvector. It does not
  # depend on the column being regressed, so it is computed once.
  M_t <- as.vector(scaled_data %*% first_eigenvector)

  # A matrix response fits one regression per column in a single call
  fit <- lm(scaled_data ~ M_t)

  # Rebuild as a plain N-by-M matrix (drops the dimnames added by lm())
  matrix(residuals(fit), nrow = N, ncol = M)
}
```

```{r}
# Post-Covid: density of the cross-correlation coefficients before (green) and
# after (orange) regressing out the first-eigenvector component. The reference
# curve is taken from Data_Post so both curves describe the same period.
residuals_matrix <- linear_model_residuals(Data_Post)

plot(density(cor(residuals_matrix)), main = "Distribution of correlation matrices of Data_Post",
     col = "orange", xlab = "Cross-correlation coefficient", type = "l")
lines(density(cor(Data_Post)), col = "green")
legend("topright", legend = c("First eigenvector removed", "Correlation matrix"),
       col = c("orange", "green"), lwd = 2, lty = c(1, 1))
```

Function to get the 10 largest entries (in absolute value) of each of the first 10 eigenvectors of the residual correlation matrix

```{r}
# Dominant entries of the leading eigenvectors of cor(residuals_matrix).
#
# residuals_matrix: numeric matrix, columns are variables.
# n_vectors:        number of leading eigenvectors inspected (default 10).
# n_entries:        number of largest-magnitude entries kept per eigenvector
#                   (default 10).
# Both counts are capped at the number of variables, so small inputs no longer
# fail with a subscript error.
# Returns a list with one element per eigenvector, each a list with
#   indices: positions of the kept entries, largest magnitude first,
#   values:  the (signed) eigenvector entries at those positions.
get_top_entries <- function(residuals_matrix, n_vectors = 10, n_entries = 10) {

  # Eigenvectors of the correlation matrix, ordered by decreasing eigenvalue
  corr_residuals <- cor(residuals_matrix)
  eigenvectors <- eigen(corr_residuals)$vectors

  n_vectors <- min(n_vectors, ncol(eigenvectors))
  n_entries <- min(n_entries, nrow(eigenvectors))

  lapply(seq_len(n_vectors), function(i) {
    # Rank the entries by absolute value and keep the largest ones
    top_indices <- order(abs(eigenvectors[, i]), decreasing = TRUE)[seq_len(n_entries)]
    list(indices = top_indices, values = eigenvectors[top_indices, i])
  })
}
```

Function to get the 2 largest entries (in absolute value) of each of the 5 eigenvectors with the smallest eigenvalues, i.e. to the left of the bulk

```{r}
# For the last five eigenvectors returned by eigen() on
# cor(residuals_matrix) (the ones with the smallest eigenvalues), report the
# two entries of largest absolute value.
# Returns a list with one element per eigenvector, each a list with
#   indices: positions of the two entries, largest magnitude first,
#   values:  the (signed) eigenvector entries at those positions.
smallest_eigenvectors <- function(residuals_matrix) {

  all_vectors <- eigen(cor(residuals_matrix))$vectors

  # Keep the trailing five columns
  tail_vectors <- all_vectors[, tail(1:ncol(all_vectors), 5)]

  lapply(seq_len(ncol(tail_vectors)), function(column) {
    magnitudes <- abs(tail_vectors[, column])
    largest_two <- order(magnitudes, decreasing = TRUE)[1:2]
    list(indices = largest_two, values = tail_vectors[largest_two, column])
  })
}
```

Function to track, across sliding windows, which eigenvector holds each of the 10 largest entries of the first eigenvector

```{r}
# Follow the dominant entries of the first eigenvector through sliding
# windows.
#
# The n largest-magnitude entries of the first eigenvector of the first window
# are selected. In every later window each entry stays assigned to its current
# eigenvector as long as it is among that eigenvector's top_k entries;
# otherwise it moves to the first eigenvector (in eigen()'s order) that has it
# among its top_k entries.
#
# Dataset:     numeric matrix, rows are observations, columns are variables.
# window_size: number of rows per window.
# step_width:  number of rows the window advances per cycle.
# n:           number of entries tracked.
# top_k:       size of the "dominant entries" set of an eigenvector
#              (default 10).
# Returns an n x (cycles + 1) matrix of eigenvector indices; column 1 is all
# 1, column j + 1 is the assignment in cycle j, NA when no eigenvector holds
# the entry (a warning is raised).
temporal_corr_tracking <- function(Dataset, window_size = 100, step_width = 20, n = 10,
                                   top_k = 10) {

  # Number of cycles based on window size and step width
  cycles <- floor((nrow(Dataset) - (window_size + 1)) / step_width)
  if (cycles < 0) {
    stop("Dataset has fewer rows than a single window requires", call. = FALSE)
  }

  # positions[k, j]: eigenvector currently holding tracked entry k
  positions <- matrix(NA, nrow = n, ncol = cycles + 1)

  # Entries to track: top n of the first eigenvector in the first window
  first_window <- Dataset[1:window_size, ]
  first_eigenvector <- eigen(cor(first_window), symmetric = TRUE)$vectors[, 1]
  top_n_indices <- order(abs(first_eigenvector), decreasing = TRUE)[1:n]

  # Every entry starts on the first eigenvector
  positions[, 1] <- 1

  for (j in seq_len(cycles)) {
    # Current window
    start_A <- (j - 1) * step_width + 1
    end_A <- start_A + window_size - 1
    vectors <- eigen(cor(Dataset[start_A:end_A, ]), symmetric = TRUE)$vectors

    # top_k dominant entries of every eigenvector, computed once per window
    # (one column per eigenvector)
    top_sets <- matrix(
      apply(abs(vectors), 2, function(v) order(v, decreasing = TRUE)[seq_len(top_k)]),
      nrow = top_k
    )

    for (k in seq_len(n)) {
      index <- top_n_indices[k]
      current <- positions[k, j]

      # Eigenvectors that have 'index' among their dominant entries
      holders <- which(colSums(top_sets == index, na.rm = TRUE) > 0)

      if (!is.na(current) && current %in% holders) {
        # Still dominant in the current eigenvector: keep the assignment
        positions[k, j + 1] <- current
      } else if (length(holders) > 0) {
        # Otherwise move to the first eigenvector that holds it. An entry lost
        # in an earlier cycle (NA) is searched for again instead of being used
        # as a column index.
        positions[k, j + 1] <- holders[1]
      } else {
        warning(paste("No eigenvector found with index", index, "in the top 10 entries for cycle", j))
        positions[k, j + 1] <- NA
      }
    }
  }

  positions
}
```

```{r}
# Heatmap of the output of temporal_corr_tracking(): one row per tracked
# entry, one column per cycle, filled by the index of the eigenvector that
# holds the entry. Tiles whose eigenvector is not 1 get a red border.
#
# positions_matrix: matrix of eigenvector indices (entries x cycles).
# n_eigenvectors:   upper limit of the fill scale (default 98); indices above
#                   it are drawn in the NA colour (grey).
# Returns the ggplot object.
plot_tracking_heatmap <- function(positions_matrix, n_eigenvectors = 98) {

  positions_matrix <- as.matrix(positions_matrix)
  n_entries <- nrow(positions_matrix)
  n_cycles <- ncol(positions_matrix)
  entry_labels <- paste0("Entry_", seq_len(n_entries))

  # Long format built with base R (no dependency on an unattached melt()):
  # as.vector() walks the matrix column by column, i.e. cycle by cycle.
  # Entry is a factor so that rows keep their numeric order instead of sorting
  # as Entry_1, Entry_10, Entry_2, ...
  positions_long <- data.frame(
    Entry = factor(rep(entry_labels, times = n_cycles), levels = entry_labels),
    Cycle = rep(seq_len(n_cycles), each = n_entries),
    Eigenvector = as.vector(positions_matrix)
  )

  ggplot(positions_long, aes(x = Cycle, y = Entry, fill = Eigenvector)) +
    geom_tile(aes(color = ifelse(Eigenvector != 1, "red", NA)), size = 0.5) +
    scale_fill_gradient(low = "lightblue", high = "darkblue", na.value = "grey",
                        limits = c(1, n_eigenvectors)) +
    scale_color_identity() +  # Border colours are used exactly as given
    labs(title = "Tracking of Top Eigenvector Entries Across Cycles",
         x = "Cycle",
         y = "Top Entries in First Eigenvector",
         fill = "Eigenvector Index") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          axis.text = element_text(size = 12),
          axis.title = element_text(size = 14),
          plot.title = element_text(size = 16, face = "bold"))
}
```