2
2
library(tidyverse )
3
3
source(' functions.R' )
4
4
5
+ # GET COMPANIES HOUSE LIVE LIST----
6
+
5
7
# Current Companies House live list, with geolocation / local authorities added
# (path is relative to the project root; no leading space in the file name)
ch <- readRDS("local/companieshouse_livelist_geocoded.rds")
# pryr::object_size(ch) # 3.4gb in memory
@@ -19,6 +21,122 @@ extract.locs <- list.files('local/account_extracts', full.names = T)
19
21
# Read every extract file listed in extract.locs and stack them into one table
extracts <- extract.locs %>%
  map(readRDS) %>%
  bind_rows()
# pryr::object_size(extracts) 732.65mb in memory
21
23
24
# Drop duplicate submitted accounts (they'll have the same date / be exactly the same)
# Account code is the date of submission
# Keeps the first row for each accountcode / companynumber pair, with all columns
extracts <- extracts %>%
  distinct(accountcode, companynumber, .keep_all = TRUE)
28
+
29
# Format date: parse enddate into a new column, accepting day-month-year,
# year-month-day or month-day-year layouts
extracts <- extracts %>%
  mutate(
    enddate_formatted = lubridate::parse_date_time(
      enddate,
      orders = c("dmy", "ymd", "mdy")
    )
  )
36
+
37
+
38
+
39
+ # GET AND PROCESS EXTRACTED ACCOUNTS DATA----
22
40
23
41
# For now, just going to keep the latest accounts
24
- # Firms with multiple accounts up to 8, have clearly submitted backdated ones this year
42
+ # Firms with multiple accounts up to 8, have clearly submitted backdated ones this year
43
+ # See https://github.com/DanOlner/companieshouseopen/blob/f2d59df75617905b2522c9be483bfb4db4f1fc5d/testcode/tests_n_checks_companieshousedata.R#L74
44
+
45
+ # For firms with multiple accounts, keep most recent date
46
+ # x <- extracts %>%
47
+ # group_by(companynumber) %>%
48
+ # filter(enddate_formatted == max(enddate_formatted)) %>%
49
+ # ungroup()
50
+
51
# Version applying to all companies is very slow
# Let's split and find newest date just for those with more than one set of accounts
# (Having filtered out the odd multiple submission of the same accounts above)
# One row per company, with `count` = number of accounts rows it has in extracts
account.count <- extracts %>%
  count(companynumber, name = "count")

# Distribution of accounts-per-company
table(account.count$count)
60
+
61
# Keep only companies with more than one set of accounts, so the most recent
# accounts date can be picked for each
multiples <- extracts %>%
  filter(
    companynumber %in% account.count$companynumber[account.count$count != 1]
  ) %>%
  arrange(companynumber)
64
+
65
# For each company with several sets of accounts, keep the single row with the
# latest end date.
# slice_max() is used instead of filter(x == max(x)) because:
#  - with filter(), one NA end date makes max() NA for the whole company and
#    every one of its rows is dropped;
#  - tied end dates would return more than one row per company.
# with_ties = FALSE keeps the first row (in current order) among any ties.
mostrecent <- multiples %>%
  group_by(companynumber) %>%
  slice_max(enddate_formatted, n = 1, with_ties = FALSE) %>%
  ungroup()

# Tick, one row per company
length(unique(mostrecent$companynumber))
72
+
73
+ # Some duplicates still
74
+ # Presumably must be some firms that submitted accounts with the same date?
75
+ # mostrecent.count <- mostrecent %>%
76
+ # group_by(companynumber) %>%
77
+ # summarise(count = n()) %>%
78
+ # ungroup()
79
+ #
80
+ # table(mostrecent.count$count)
81
+ #
82
+ # mostrecent %>% filter(companynumber %in% mostrecent.count$companynumber[mostrecent.count$count > 1]) %>% View
83
+
84
+ # Ah that one's easy to filter out duplicates - the account code is the same... doing above
85
+
86
+
87
# Combine again: the most recent accounts for multi-account companies, plus
# every company that only ever had a single set of accounts
extracts.singleaccounts <- bind_rows(
  mostrecent,
  extracts %>%
    filter(
      companynumber %in% account.count$companynumber[account.count$count == 1]
    )
)

# Confirm, should be one row per company... TICK
length(unique(extracts.singleaccounts$companynumber))
95
+
96
+
97
# Only need the one date column now: drop the unparsed enddate and give the
# parsed column its name
extracts.singleaccounts <- extracts.singleaccounts %>%
  select(-enddate) %>%
  rename(enddate = enddate_formatted)
101
+
102
+
103
+ # CHECK MATCH BETWEEN LIVE LIST AND EXTRACTED ACCOUNTS----
104
+
105
# Check match on companies house number... 8.2% with no match in the live list
# First as raw counts, then as percentages
table(extracts.singleaccounts$companynumber %in% ch$CompanyNumber)
table(extracts.singleaccounts$companynumber %in% ch$CompanyNumber) %>%
  prop.table() * 100

# Look at the falses - what accounts are those? Many NI - postcodes are GB only so that makes sense
# A lot of others are dissolved - no longer live
# (View() opens the interactive data viewer, so run this line by hand)
extracts.singleaccounts %>%
  filter(!companynumber %in% ch$CompanyNumber) %>%
  View()
112
+
113
+
114
+
115
+
116
+ # LINK LIVE LIST AND ACCOUNTS EXTRACTS----
117
+
118
# Currently ~3.2 million businesses
# Keep as sf object with point geometry for locations
# Inner join: only companies present in both the live list and the accounts
# extracts are kept. Key names must match the column names exactly.
both <- ch %>%
  inner_join(
    extracts.singleaccounts,
    by = c("CompanyNumber" = "companynumber")
  )
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
# 2.4gb
pryr::object_size(both)

# Save the combined live list + accounts data for later use
saveRDS(both, "local/accountextracts_n_livelist_geocoded_combined.rds")
139
+
140
+
141
+
142
+
0 commit comments