Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
63521d9
Update dev version (Github Actions)
github-actions[bot] Sep 23, 2024
15f314a
validation functions added for ae_table_grade
BaptisteArchambaud Sep 23, 2024
b3c2fb8
Merge branch 'baptiste-validation' of https://github.com/Oncostat/grs…
BaptisteArchambaud Sep 23, 2024
2af87f8
Update dev version (Github Actions)
github-actions[bot] Sep 23, 2024
6a4b9e7
updating structure of AE_table_grade validation files
BaptisteArchambaud Oct 29, 2024
c39b6fd
Merge branch 'main' into baptiste-nusaibah-validation
BaptisteArchambaud Oct 30, 2024
7d743c5
Update outputs_ae_table_grade.R
BaptisteArchambaud Nov 4, 2024
e5d6c1a
ae_table_soc validation - formatting SAS and R outputs
BaptisteArchambaud Nov 6, 2024
99e08ff
compare_grade output table
NusaibahIbr Nov 19, 2024
dcb1467
sortie des 2 tables concordantes
NusaibahIbr Nov 21, 2024
e5447f8
create compare_soc (copy compare_grade)
NusaibahIbr Nov 21, 2024
d8c1d68
modification according AE output with SOC & PT
NusaibahIbr Nov 25, 2024
cc6e7fa
correction
NusaibahIbr Nov 25, 2024
60b5318
Nettoyer le script et compatibilité avec les tables avec bras de trt
NusaibahIbr Dec 26, 2024
e272d83
improve and finish compare_soc function
NusaibahIbr Dec 26, 2024
3d6074c
work in progress
NusaibahIbr Jan 2, 2025
09a1057
Merge branch 'main' into baptiste-nusaibah-validation
NusaibahIbr Jan 2, 2025
632809c
end to major modifications
NusaibahIbr Jan 16, 2025
eca9df5
formatting with flextable package
NusaibahIbr Feb 3, 2025
4115b8a
change to make the comparison more exhaustive
NusaibahIbr Apr 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions R/compare_grade.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
# function that compares the summrised tables in R versus those in SAS
# output: if difference, dataframe with each row details of difference
# if no difference, dataframe displaying table R and table SAS side by side.
# author: Nusaibah


compare_grade <- function(tabR,tabSAS){

tab=list() # list containing all difference between the 2 tables
tabRraw=tabR
tabSASraw=tabSAS

#warning("Different number of grade levels")------------
if (nrow(tabR)!=nrow(tabSAS)){

if (nrow(tabR)<nrow(tabSAS)){
tab$grade=c(tab$grade,tabSAS[which(!(tabSAS$grade%in% tabR$grade)),"grade"]%>%pull)
tab$level=c(tab$level,rep("Mineur",length(tabSAS[which(!(tabSAS$grade%in% tabR$grade)),"grade"]%>%pull)))
tab$table=c(tab$table,rep("R",length(tabSAS[which(!(tabSAS$grade%in% tabR$grade)),"grade"]%>%pull)))
tab$main=c(tab$main,rep("Missing grade level",length(tabSAS[which(!(tabSAS$grade%in% tabR$grade)),"grade"]%>%pull)))
tab$valueR=c(tab$valueR,rep(NA,length(tabSAS[which(!(tabSAS$grade%in% tabR$grade)),"grade"]%>%pull)))
tab$valueSAS=c(tab$valueSAS,rep("Filled",length(tabSAS[which(!(tabSAS$grade%in% tabR$grade)),"grade"]%>%pull)))
tab$arm=c(tab$arm,rep("All",length(tabSAS[which(!(tabSAS$grade%in% tabR$grade)),"grade"]%>%pull)))

}else if (nrow(tabR)>nrow(tabSAS)){
tab$grade=c(tab$grade,tabR[which(!(tabR$grade%in% tabSAS$grade)),"grade"]%>%pull)
tab$level=c(tab$level,rep("Mineur",length(tabR[which(!(tabR$grade%in% tabSAS$grade)),"grade"]%>%pull)))
tab$table=c(tab$table,rep("SAS",length(tabR[which(!(tabR$grade%in% tabSAS$grade)),"grade"]%>%pull)))
tab$main=c(tab$main,rep("Missing grade level",length(tabR[which(!(tabR$grade%in% tabSAS$grade)),"grade"]%>%pull)))
tab$valueR=c(tab$valueR,rep("Filled",length(tabR[which(!(tabR$grade%in% tabSAS$grade)),"grade"]%>%pull)))
tab$valueSAS=c(tab$valueSAS,rep(NA,length(tabR[which(!(tabR$grade%in% tabSAS$grade)),"grade"]%>%pull)))
tab$arm=c(tab$arm,rep("All",length(tabR[which(!(tabR$grade%in% tabSAS$grade)),"grade"]%>%pull)))



}
}
# warning("Different number of arm")----------
if (ncol(tabR)!=ncol(tabSAS)){

if (ncol(tabR)<ncol(tabSAS)){
tab$level=c(tab$level,"Mineur")
tab$arm=c(tab$arm,paste(unique(str_extract(colnames(tabSAS)[which(!(colnames(tabSAS)%in% colnames(tabR)))],"[:digit:]")),collapse = " & "))
tab$table=c(tab$table,"R")
tab$main=c(tab$main,"Missing arm")
tab$valueR=c(tab$valueR,NA)
tab$valueSAS=c(tab$valueSAS,"Filled")
tab$grade=c(tab$grade,"All")


}else if (ncol(tabR)>ncol(tabSAS)){
tab$level=c(tab$level,"Mineur")
tab$arm=c(tab$arm,paste(unique(str_extract(colnames(tabR)[which(!(colnames(tabR)%in% colnames(tabSAS)))],"[:digit:]")),collapse = " & "))
tab$table=c(tab$table,"SAS")
tab$main=c(tab$main,"Missing arm")
tab$valueR=c(tab$valueR,"Filled")
tab$valueSAS=c(tab$valueSAS,NA)
tab$grade=c(tab$grade,"All")

}}

#compare the commun elements---------------
tabR=tabR[,colnames(tabR) %in% colnames(tabSAS)]
tabSAS=tabSAS[,colnames(tabSAS) %in% colnames(tabR)]

df=tabR%>%arrange(grade)%>%full_join(tabSAS,by="grade",suffix = c(".r",".sas"))


indice=which(df[,paste0(tabR%>%select(-grade)%>%colnames(),paste=".r")]!=df[,paste0(tabSAS%>%select(-grade)%>%colnames(),paste=".sas")],
arr.ind=TRUE) # difference numérique number or percentage
indice[,"col"]=indice[,"col"]+1 # parce qu'on avait retiré le grade

indice=rbind2(indice,
which(is.na(df),arr.ind = T)%>%as.data.frame())
# difference NA quand 2 grades ne se correspondent pas dans les 2 tables. je garde NA

if (nrow(indice)!=0){
for (i in 1: nrow(indice)){

if (is.na(df[indice[i,"row"],indice[i,"col"]]) & grepl(".r",colnames(df)[indice[i,"col"]])){

tab$level=c(tab$level,"Mineur")
tab$grade=c(tab$grade,df[indice[i,"row"],"grade"]%>%pull)
tab$table=c(tab$table,"R")
tab$main=c(tab$main,"Missing grade level")
tab$valueR=c(tab$valueR,NA)
if (indice[i,"col"]%%2==0){
tab$valueSAS=c(tab$valueSAS,
df[indice[i,"row"],indice[i,"col"]+2] %>%pull )
}else{
tab$valueSAS=c(tab$valueSAS,
paste0(df[indice[i,"row"],indice[i,"col"]+2] %>%pull ,"%"))
}
tab$arm = c(tab$arm,str_extract(string=colnames(df)[indice[i,"col"]],pattern = "[:digit:]"))


}else if (is.na(df[indice[i,"row"],indice[i,"col"]]) & grepl(".sas",colnames(df)[indice[i,"col"]])){

tab$level=c(tab$level,"Mineur")
tab$grade=c(tab$grade,df[indice[i,"row"],"grade"]%>%pull)
tab$table=c(tab$table,"SAS")
tab$main=c(tab$main,"Missing grade level")
if (indice[i,"col"]%%2==0){ #means number
tab$valueR=c(tab$valueR, df[indice[i,"row"],indice[i,"col"]-2] %>% pull)
}else{ #mean percentage
tab$valueR=c(tab$valueR, paste0(df[indice[i,"row"],indice[i,"col"]-2] %>% pull,"%"))
}
tab$valueSAS=c(tab$valueSAS,NA)
tab$arm = c(tab$arm,str_extract(string=colnames(df)[indice[i,"col"]],pattern = "[:digit:]"))

}#else

if(indice[i,"col"]%%2==0){

tab$level=c(tab$level,"MAJEUR")
tab$grade=c(tab$grade,df[indice[i,"row"],"grade"]%>%pull)
tab$table=c(tab$table,"Both")
tab$main=c(tab$main,"Different number")
tab$arm = c(tab$arm,str_extract(string=colnames(df)[indice[i,"col"]],pattern = "[:digit:]"))
if (grepl(".sas",colnames(df)[indice[i,"col"]])){
tab$valueR=c(tab$valueR,df[indice[i,"row"],indice[i,"col"]-2]%>%pull)
tab$valueSAS=c(tab$valueSAS,df[indice[i,"row"],indice[i,"col"]]%>%pull)
}else{
tab$valueR=c(tab$valueR,df[indice[i,"row"],indice[i,"col"]]%>%pull)
tab$valueSAS=c(tab$valueSAS,df[indice[i,"row"],indice[i,"col"]+2]%>%pull)
}
}else if (indice[i,"col"]%%2==1){

tab$level=c(tab$level,"MAJEUR")
tab$grade=c(tab$grade,df[indice[i,"row"],"grade"]%>%pull)
tab$table=c(tab$table,"Both")
tab$main=c(tab$main,"Different percentage")
tab$arm = c(tab$arm,str_extract(string=colnames(df)[indice[i,"col"]],pattern = "[:digit:]"))
if (grepl(".sas",colnames(df)[indice[i,"col"]])){
tab$valueR=c(tab$valueR,df[indice[i,"row"],indice[i,"col"]-2]%>%pull)
tab$valueSAS=c(tab$valueSAS,df[indice[i,"row"],indice[i,"col"]]%>%pull)
}else{
tab$valueR=c(tab$valueR,df[indice[i,"row"],indice[i,"col"]]%>%pull)
tab$valueSAS=c(tab$valueSAS,df[indice[i,"row"],indice[i,"col"]+2]%>%pull)
}

}


}}



if (nrow(tabRraw)==nrow(tabSASraw) & ncol(tabRraw)==ncol(tabSASraw) & nrow(indice)==0 ){
# warning("Comparison result: same outputs")

tablo = tabRraw%>%
full_join(tabSASraw,
by="grade",
suffix = c("_r","_sas"))%>%
flextable()%>%
add_footer_lines( "Comparison result: same outputs", top=TRUE) %>%
autofit() %>%
add_header_row( values = c("grade","R table", "SAS table"), colwidths = c(1,ncol(tabRraw)-1, ncol(tabSASraw)-1)) %>%
align(part="header", align="center",i=1)%>%
bold(part="header") %>%
# bold(i=~N1_r!=N1_sas,j=c(2,4))%>%
vline(j=ncol(tabRraw), part = "body") %>%

set_header_labels(values= c(colnames(tabRraw),colnames(tabSASraw)[-1])) %>%
merge_at(i=c(1,2),j=1, part = "header")%>%
align(part="body", align="center",j=1)
}else{
tablo=as.data.frame(tab)%>%
#distinct(level,grade,table,main,.keep_all = TRUE) %>%
flextable()%>%
bold(part="header") %>%
align(part="body", align="right",i=~str_detect(valueR,pattern="[:digit:]"),j="valueR")%>%
align(part="body", align="right",i=~str_detect(valueSAS,pattern="[:digit:]"),j="valueSAS")%>%
autofit() %>%
bg(i=~level=="MAJEUR",bg="#f06d4d")
}


return(tablo)
}


194 changes: 194 additions & 0 deletions R/compare_soc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
# function that compares the summurised tables in R versus those in SAS
# output: if difference, dataframe with each row details of difference
# if no difference, dataframe displaying table R and table SAS side by side.
# author: Nusaibah

compare_soc <- function(tabR,tabSAS){

# ae_table_soc() affiche toutes les colonnes grade 1à 5 NA et total(option) (pour chaque bras si le bras est indiqué)
# supprimée les colonnes et lignes vides pour optimiser l'algorithme
tabR = tabR[,!apply(is.na(tabR), 2, all)]
tabR = tabR[!apply(is.na(tabR%>%select(-soc)), 1, all),]

tabRraw=tabR
tabSASraw=tabSAS
tab=list()

if ("term_"%in% c(colnames(tabR),colnames(tabSAS))){
tabR = tabR %>% arrange(soc,term_)
}else{
tabR = tabR %>% arrange(soc)
}

## warning("Different number of SOC")----------------
if (nrow(tabR)!=nrow(tabSAS)){

socdiff=table(tabR$soc) %>%
as.data.frame() %>%
full_join(table(tabSAS$soc) %>%
as.data.frame(),
by="Var1",suffix = c(".r",".sas")) %>%
filter(Freq.r !=Freq.sas | (is.na(Freq.r) |is.na(Freq.sas)))

if (nrow(tabR)<nrow(tabSAS)){

tab$level=c(tab$level,"Mineur")
tab$soc=c(tab$soc,paste(socdiff%>%filter(Freq.r<Freq.sas| is.na(Freq.r))%>%select(Var1)%>%pull,collapse = " & "))
tab$table=c(tab$table,"R")
if(any(grepl("level",colnames(tabR)))){
tab$arm=c(tab$arm,NA)}
if ("term_"%in% c(colnames(tabR),colnames(tabSAS))){
tab$term_=c(tab$term_,paste(tabSAS[which(!(tabSAS$term_%in% tabR$term_)),"term_"]%>%pull,collapse = " & "))
tab$main=c(tab$main,"Missing PT item")
}else{
tab$term_=c(tab$term_,NA)
tab$main=c(tab$main,"Missing SOC item")
}
tab$valueR=c(tab$valueR,NA)
tab$valueSAS=c(tab$valueSAS,"Filled")

}else if (nrow(tabR)>nrow(tabSAS)){


tab$level=c(tab$level,"Mineur")
tab$soc=c(tab$soc,paste(socdiff%>%filter(Freq.r>Freq.sas | is.na(Freq.sas))%>%select(Var1)%>%pull,collapse = " & "))
tab$table=c(tab$table,"SAS")
tab$grade=c(tab$grade,NA)
if(any(grepl("bras",colnames(tabR)))){
tab$arm=c(tab$arm,NA)}
if ("term_"%in% c(colnames(tabR),colnames(tabSAS))){
tab$term_=c(tab$term_,paste(tabR[which(!(tabR$term_%in% tabSAS$term_)),"term_"]%>%pull,collapse = " & "))
tab$main=c(tab$main,"Missing PT item")
}else{
tab$term_=c(tab$term_,NA)
tab$main=c(tab$main,"Missing SOC item")
}
tab$valueR=c(tab$valueR,"Filled")
tab$valueSAS=c(tab$valueSAS,NA)
}
}
## #warning("Different number of column (arm or grade)")--------------
if (ncol(tabR)!=ncol(tabSAS)){
if (ncol(tabR)<ncol(tabSAS)){

tab$level=c(tab$level,"Mineur")
tab$grade=c(tab$grade,paste(colnames(tabSAS)[-(which(colnames(tabSAS) %in% colnames(tabR)))],collapse=" & "))
tab$table=c(tab$table,"R")
tab$soc=c(tab$soc,NA)
tab$main=c(tab$main,"Missing grade")
tab$valueR=c(tab$valueR,NA)
tab$valueSAS=c(tab$valueSAS,"Filled")
tab$term_=c(tab$term_,NA)

if(any(grepl("bras",colnames(tabR)))){
tab$arm=c(tab$arm,paste(unique(str_extract(colnames(tabSAS)[which(!(colnames(tabSAS)%in% colnames(tabR)))],"bras[:alnum:]+")),collapse = " & "))
}
}else if (ncol(tabR)>ncol(tabSAS)){

tab$level=c(tab$level,"Mineur")
tab$soc=c(tab$soc,NA)
tab$grade=c(tab$grade,paste(colnames(tabR)[-(which(colnames(tabR) %in% colnames(tabSAS)))],collapse = " & "))
if(any(grepl("bras",colnames(tabR)))){
tab$arm=c(tab$arm,paste(unique(str_extract(colnames(tabR)[which(!(colnames(tabR)%in% colnames(tabSAS)))],"bras[:alnum:]+")),collapse = " & "))
}
tab$table=c(tab$table,"SAS")
tab$main=c(tab$main,"Missing grade")
tab$valueR=c(tab$valueR,"Filled")
tab$valueSAS=c(tab$valueSAS,NA)
tab$term_=c(tab$term_,NA)

}
}
## commun element validation ------------
if ("term_"%in% c(colnames(tabR),colnames(tabSAS))){
dfsas=tabSAS%>%
pivot_longer(-c("soc","term_"),names_to = "grade",values_to = "count")%>%
mutate(table="SAS")


df=tabR%>%arrange(soc,term_)%>%
pivot_longer(-c("soc","term_"),names_to = "grade",values_to = "count")%>%
mutate(table="R")%>%
full_join(dfsas, #instead of full_join
by=c("soc","term_","grade"),
suffix = c(".r",".sas"))


if (any(grepl("bras",df$grade))){

df=df%>%separate(col="grade",into=c("arm","grade"),sep="_")} #instead of _

indice= df%>%filter(count.r!=count.sas | is.na(count.r)!=is.na(count.sas))


}else{
df=tabR%>%arrange(soc)%>%
pivot_longer(-c("soc"),names_to = "grade",values_to = "count")%>%
mutate(table="R")%>%
full_join(tabSAS%>% #instead of full
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ce full join émet un warning :

Avis dans full_join(., tabSAS %>% pivot_longer(-c("soc"), names_to = "grade",  :
  Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to
  silence this warning.

Il est émit quand term est renseigné, et je crois qu'il fautdrait écrire pivot_longer(-any_of(c("soc", "term"), ...), ou quelque chose du genre (et dans by aussi).
J'ai l'impression que le problème d'output vient de là.

pivot_longer(-c("soc"),names_to = "grade",values_to = "count")%>%
mutate(table="SAS"),
by=c("soc","grade"),
suffix = c(".r",".sas"))
if (any(grepl("bras",df$grade))){ df=df%>%mutate("arm"=str_extract(grade,"bras[:alnum:]+"))}
indice=df[which(df$count.r!=df$count.sas | (is.na(df$count.r) & !is.na(df$count.sas)) |
(!is.na(df$count.r) & is.na(df$count.sas))
,arr.ind = T),]
}

if (nrow(indice)!=0){

tab =rbind.fill(as.data.frame(tab),indice%>%
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

La fonction rbind.fill() vient du package plyr qui n'est pas importé dans grstat et ne devrait vraiment plus être utilisé aujourd'hui (package non maintenu). Je te conseillerais même de désinstaller plyr pour éviter de l'utiliser par erreur. Je crois qu'on peut remplacer par bind_rows().

mutate(level="MAJEUR",
table="Both",
main="Different value")%>%
dplyr::rename(valueR="count.r",
valueSAS="count.sas")%>%
select(-c("table.r","table.sas")))
}


if (nrow(tabRraw)==nrow(tabSASraw) & ncol(tabRraw)==ncol(tabSASraw) & nrow(indice)==0 ){
# warning("Comparison result: same outputs")

if ("term_" %in% colnames(tabRraw)){ jointure=c("soc","term_")
colsize=c(1,1,ncol(tabRraw)-2,ncol(tabSASraw)-2)}else{jointure="soc"
colsize=c(1,ncol(tabRraw)-1,ncol(tabSASraw)-1) }
tab = tabRraw%>%
full_join(tabSASraw,
by=jointure,
suffix = c("_r","_sas"))%>%
flextable()%>%
add_footer_lines( "Comparison result: same outputs", top=TRUE) %>%
autofit() %>%
add_header_row( values = c(jointure,"R table", "SAS table"), colwidths = colsize) %>%
align(part="header", align="center",i=1)%>%
bold(part="header") %>%
# bold(i=~N1_r!=N1_sas,j=c(2,4))%>%
vline(j=ncol(tabRraw), part = "body") %>%

set_header_labels(values= c(colnames(tabRraw),colnames(tabSASraw)[-c(1:length(jointure))])) %>%
merge_at(i=c(1,2),j=1, part = "header")%>%
align(part="body", align="left",j=1)

if ("term_" %in% colnames(tabRraw)){
tab = tab %>%
merge_at(i=c(1,2), j=2, part = "header")%>%
align(part="body", align="left",j=2)}
}else{
tab=as.data.frame(tab)%>%
# distinct(level,grade,table,main,soc,.keep_all = TRUE)%>%
flextable()%>%
bold(part="header") %>%
align(part="body", align="right",i=~str_detect(valueR,pattern="[:digit:]"),j="valueR")%>%
align(part="body", align="right",i=~str_detect(valueSAS,pattern="[:digit:]"),j="valueSAS")%>%
autofit() %>%
bg(i=~level=="MAJEUR",bg="#f06d4d")
}



return(tab)


}
Loading