Skip to content

Commit d830eb3

Browse files
committed
Next Release
- Final for CRAN
1 parent 4340c27 commit d830eb3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+311
-218
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ Type: Package
22
Package: tidycells
33
Title: Read Tabular Data from Diverse Sources and Easily Make
44
Them Tidy
5-
Version: 0.2.0.99
5+
Version: 0.2.1
66
Authors@R:
77
person(given = "Indranil",
88
family = "Gayen",

R/collate_columns.R

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,13 +96,16 @@ collate_columns <- function(composed_data,
9696
if (length(dcl) == 1) {
9797
out_d <- dcl[[1]]
9898

99+
colnames(out_d) <- stringr::str_replace_all(colnames(out_d), "uncollated_", "old_uc_")
100+
colnames(out_d) <- stringr::str_replace_all(colnames(out_d), "collated_", "old_c_")
101+
99102
restcols <- setdiff(colnames(out_d), defcols_this)
100103
if (length(restcols) > 0) {
101104
cn_map_0 <- tibble(cn = restcols) %>%
102105
mutate(is_major = stringr::str_detect(tolower(cn), "major")) %>%
103106
arrange(cn) %>%
104-
mutate(sn = seq_along(cn), sn_m = sn + is_major * (10^10)) %>%
105-
arrange(desc(sn_m)) %>%
107+
mutate(sn = seq_along(cn), sn_m = sn - is_major * (10^10)) %>%
108+
arrange(sn_m) %>%
106109
mutate(fsn = seq_along(cn), new_cn = paste0("collated_", fsn)) %>%
107110
select(cn, new_cn)
108111

R/compose_cells.R

Lines changed: 67 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -49,51 +49,44 @@ compose_cells_raw <- function(ca, post_process = TRUE, attr_sep = " :: ",
4949
if (!inherits(ca, "cell_analysis")) {
5050
abort("A 'Cell Analysis' expected.")
5151
}
52-
52+
5353
dam <- ca$details$data_attr_map_raw
54-
54+
5555
dam <- dam %>%
5656
group_by(data_gid, direction_basic, direction_group) %>%
5757
mutate(dist_order = dist %>% as.factor() %>% as.integer()) %>%
5858
ungroup()
59-
59+
6060
dam <- dam %>%
6161
group_by(data_gid, attr_gid) %>%
6262
mutate(attr_gid_split_order = attr_gid_split %>% as.factor() %>% as.integer()) %>%
6363
ungroup()
64-
65-
fj_this <- function(x, y) {
66-
fj(x, y,
67-
join_by = c("row", "col", "value", "data_block"),
68-
sallow_join = TRUE, sep = attr_sep
69-
)
70-
}
71-
64+
7265
dcomp00 <- dam %>%
7366
group_by(data_gid) %>%
7467
group_split() %>%
7568
map(~ .x %>%
76-
group_by(attr_gid, direction, attr_gid_split) %>%
77-
group_split())
78-
69+
group_by(attr_gid, direction, attr_gid_split) %>%
70+
group_split())
71+
7972
dcomp0 <- dcomp00 %>%
8073
map(~ .x %>%
81-
# this try should be removed if unpivotr::enhead is internalized
82-
# or similar behaving functions are developed.
83-
map(~ {
84-
e <- try(stitch_direction(.x, ca$cell_df, trace_it = trace_it_back), silent = TRUE)
85-
.ok <- !inherits(e, "try-error")
86-
.d <- NULL
87-
if (!.ok) .d <- .x
88-
list(ok = .ok, out = e, dat = .d)
89-
}))
90-
74+
# this try should be removed if unpivotr::enhead is internalized
75+
# or similar behaving functions are developed.
76+
map(~ {
77+
e <- try(stitch_direction(.x, ca$cell_df, trace_it = trace_it_back), silent = TRUE)
78+
.ok <- !inherits(e, "try-error")
79+
.d <- NULL
80+
if (!.ok) .d <- .x
81+
list(ok = .ok, out = e, dat = .d)
82+
}))
83+
9184
chk0 <- dcomp0 %>%
9285
map_lgl(~ .x %>%
93-
map_lgl(~ !.x$ok) %>%
94-
any()) %>%
86+
map_lgl(~ !.x$ok) %>%
87+
any()) %>%
9588
any()
96-
89+
9790
if (chk0) {
9891
if (!silent) {
9992
# Need to show user what has been missed
@@ -110,65 +103,69 @@ compose_cells_raw <- function(ca, post_process = TRUE, attr_sep = " :: ",
110103
ok = "Yes", cancel = "No",
111104
is_question = TRUE
112105
)
113-
106+
114107
if (identical(user_res, TRUE)) {
115108
user_res <- "yes"
116109
}
117-
110+
118111
if (user_res == "yes") {
119112
# return failed analysis part for observing
120113
patched_ca <- ca
121-
114+
122115
dp0 <- dcomp0 %>% map_df(~ .x %>%
123-
map_lgl(~ !.x$ok) %>%
124-
.x[.] %>%
125-
map_df(~ .x$dat))
116+
map_lgl(~ !.x$ok) %>%
117+
.x[.] %>%
118+
map_df(~ .x$dat))
126119
patched_ca$details$data_attr_map_raw <- unique(dp0[colnames(patched_ca$details$data_attr_map_raw)])
127-
120+
128121
warn(paste0(
129122
"Failed portion of Cell-Analysis is returned",
130123
"\nIn the plots you should see texts, only in failed attributes."
131124
))
132-
125+
133126
return(patched_ca)
134127
}
135128
}
136129
}
137130
}
138-
131+
139132
dcomp0 <- dcomp0 %>% map(~ .x %>%
140-
map_lgl(~ .x$ok) %>%
141-
.x[.] %>%
142-
map(~ .x$out))
143-
133+
map_lgl(~ .x$ok) %>%
134+
.x[.] %>%
135+
map(~ .x$out))
136+
144137
chk1 <- dcomp0 %>%
145138
map_int(length) %>%
146139
sum()
147-
140+
148141
if (chk1 > 0) {
149-
dcomp <- dcomp0 %>% map(~ reduce(.x, fj_this))
142+
dcomp <- dcomp0 %>%
143+
map(~ reduce(.x, fj,
144+
join_by = c("row", "col", "value", "data_block"),
145+
sallow_join = TRUE, sep = attr_sep
146+
))
150147
} else {
151148
abort("Failed to compose")
152149
}
153-
154-
150+
151+
155152
if (print_col_info) {
156153
dlinf <- dcomp %>% map(get_all_col_representative, cut_th = 4, lower_it = FALSE)
157-
154+
158155
dlinfc <- dlinf %>% map(~ .x %>% purrr::imap_chr(~ paste0(" ", cli_bb(.y), "\n ", paste0(cli_g(.x), collapse = ", "))))
159156
names(dlinfc) <- paste0("data_block = ", seq_along(dlinfc))
160-
157+
161158
xmsg <- dlinfc %>%
162159
purrr::imap_chr(~ paste0(cli_br(.y), "\n", paste0(.x, collapse = "\n"))) %>%
163160
paste0(collapse = "\n")
164-
161+
165162
cat(xmsg)
166163
}
167-
164+
168165
if (!post_process) {
169166
return(invisible(dcomp))
170167
}
171-
168+
172169
compose_cells_raw_post_process(dcomp, details = details, discard_raw_cols = discard_raw_cols, attr_sep = attr_sep)
173170
}
174171

@@ -181,50 +178,50 @@ compose_cells_raw_post_process <- function(dcomp, details = FALSE, discard_raw_c
181178
cns <- cns %>% setdiff(cns_trace)
182179
cns_base <- c("row", "col", "data_block", "value")
183180
cns <- cns %>% setdiff(cns_base)
184-
181+
185182
cns_d <- tibble(cname = cns, cn = cns) %>%
186183
tidyr::separate(cn, into = c("ag", "rc", "dir", "ad", "d"))
187-
188-
184+
185+
189186
cns_d <- cns_d %>%
190187
# anticlockwise
191188
mutate(dir_n = recode(dir,
192-
top = 1,
193-
topLeft = 2,
194-
left = 3,
195-
bottomLeft = 4,
196-
bottom = 5,
197-
bottomRight = 6,
198-
right = 7,
199-
topRight = 8
189+
top = 1,
190+
topLeft = 2,
191+
left = 3,
192+
bottomLeft = 4,
193+
bottom = 5,
194+
bottomRight = 6,
195+
right = 7,
196+
topRight = 8
200197
)) %>%
201198
mutate(rc_n = recode(rc,
202-
row = 1,
203-
col = 2,
204-
corner = 3
199+
row = 1,
200+
col = 2,
201+
corner = 3
205202
)) %>%
206203
mutate(cname_ord = paste(rc_n, dir_n, ad, d, sep = "_"))
207-
208-
209-
204+
205+
206+
210207
dcomp_r <- dcomp %>%
211208
map(~ refine_cols(.x, cn_df = cns_d, sep = attr_sep)) %>%
212209
bind_rows()
213-
210+
214211
# add rc_df class
215212
class(dcomp_r) <- c(class(dcomp_r), "rc_df") %>% unique()
216-
213+
217214
this_cols <- colnames(dcomp_r)
218215
f_cols <- c("row", "col", "data_block", "value")
219216
this_cols <- this_cols %>% setdiff(f_cols)
220217
nm_cols <- this_cols[stringr::str_detect(this_cols, "row|col|corner")]
221218
m_cols <- this_cols %>% setdiff(nm_cols)
222-
219+
223220
if (details) {
224221
lo <- list(raw_data = dcomp_r, must_cols = f_cols, major_col = m_cols, minor_col = nm_cols)
225222
return(lo)
226223
}
227-
224+
228225
if (discard_raw_cols) {
229226
dcomp_r[c(f_cols, m_cols)]
230227
} else {

R/read_cells_stages.R

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,9 @@ do_collate <- function(at_level, this_level, out_l, simplify, simple) {
142142
dcl <- list(out_l$final_composition)
143143
}
144144

145-
out_l$final <- dcl %>% map_df(~ collate_columns(.x) %>% as_tibble())
145+
out_l$final <- dcl %>%
146+
map(~ collate_columns(.x, retain_cell_address = TRUE)) %>%
147+
collate_columns()
146148

147149
out_l$stage <- read_cell_task_orders[6]
148150
if (simplify) {

R/reduce_2dfs.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@ get_connected_cols <- function(col_map_with_dist) {
1313
}
1414

1515
reduce_2dfs <- function(dc1, dc2, combine_th = 0.6, rest_cols = Inf, retain_other_cols = FALSE) {
16-
colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "collated_", "d1_old_c_")
1716
colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "uncollated_", "d1_old_uc_")
17+
colnames(dc1) <- stringr::str_replace_all(colnames(dc1), "collated_", "d1_old_c_")
1818

19-
colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "collated_", "d2_old_c_")
2019
colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "uncollated_", "d2_old_uc_")
20+
colnames(dc2) <- stringr::str_replace_all(colnames(dc2), "collated_", "d2_old_c_")
2121

2222

2323
cr1 <- get_all_col_representative(dc1)

README.Rmd

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ To start with `tidycells`, I invite you to see `vignette("tidycells-intro")` or
120120
## Quick Overview
121121

122122
Let's take a quick look at the example data given in
123+
123124
```{r, eval=FALSE}
124125
system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE)
125126
```
@@ -133,6 +134,7 @@ knitr::include_graphics("vignettes/ext/marks.png")
133134
Let's try `tidycells` functions in this data
134135

135136
Read at once
137+
136138
```{r, eval=FALSE}
137139
# you should have tidyxl installed
138140
system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE) %>%
@@ -161,13 +163,17 @@ d <- system.file("extdata", "marks.xlsx", package = "tidycells", mustWork = TRUE
161163
read_cells(at_level = "make_cells") %>%
162164
.[[1]]
163165
```
166+
164167
Or
168+
165169
```{r}
166170
# or you may do
167171
d <- system.file("extdata", "marks_cells.rds", package = "tidycells", mustWork = TRUE) %>%
168172
readRDS()
169173
```
174+
170175
Then
176+
171177
```{r}
172178
d <- numeric_values_classifier(d)
173179
da <- analyze_cells(d)
@@ -182,7 +188,9 @@ dc <- compose_cells(da, print_attribute_overview = TRUE)
182188
knitr::include_graphics("vignettes/ext/compose_cells_cli1.png")
183189
dc <- compose_cells(da)
184190
```
191+
185192
If you want well-aligned columns, then you may like to do
193+
186194
```{r}
187195
# bit tricky and tedious unless you do print_attribute_overview = TRUE in above line
188196
dcfine <- dc %>%
@@ -206,6 +214,7 @@ dcfine <- dc %>%
206214
```
207215

208216
`head(dcfine)` looks like
217+
209218
```{r, echo=FALSE}
210219
knitr::kable(head(dcfine), align = c(rep("l", 3), "c"))
211220
```
@@ -277,6 +286,8 @@ The `readabs` package helps you easily download, import, and tidy time series da
277286
Gives ability for choosing any rectangular data file using interactive GUI dialog box, and seamlessly manipulating tidy data between an 'Excel' window and R session.
278287
* The [tidyABS](https://github.com/ianmoran11/tidyABS) package:
279288
The `tidyABS` package converts ABS excel tables to tidy data frames. It uses rules-of-thumb to determine the structure of excel tables, however it sometimes requires pointers from the user. This package is in early development.
289+
* The [hypoparsr](https://github.com/tdoehmen/hypoparsr) package:
290+
This package takes a different approach to CSV parsing by creating different parsing hypotheses for a given file and ranking them based on data quality features.
280291

281292

282293
## Acknowledgement

README.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,8 +182,9 @@ After this you need to run `compose_cells` (with argument
182182
dc <- compose_cells(da, print_attribute_overview = TRUE)
183183
```
184184

185-
<img src="vignettes/ext/compose_cells_cli1.png" width="451px" /> If you
186-
want a well-aligned columns then you may like to
185+
<img src="vignettes/ext/compose_cells_cli1.png" width="451px" />
186+
187+
If you want well-aligned columns, then you may like to
187188
do
188189

189190
``` r
@@ -339,6 +340,10 @@ level only.
339340
uses rules-of-thumb to determine the structure of excel tables,
340341
however it sometimes requires pointers from the user. This package
341342
is in early development.
343+
- The [hypoparsr](https://github.com/tdoehmen/hypoparsr) package: This
344+
package takes a different approach to CSV parsing by creating
345+
different parsing hypotheses for a given file and ranking them based
346+
on data quality features.
342347

343348
## Acknowledgement
344349

0 commit comments

Comments
 (0)