Skip to content

Commit ab2d137

Browse files
committed
Account extracts linked to live list
1 parent f2d59df commit ab2d137

File tree

2 files changed

+149
-2
lines changed

2 files changed

+149
-2
lines changed

testcode/tests_n_checks_companieshousedata.R

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ table(ch$CompanyStatus)
5555
#91.3% active
5656
table(ch$CompanyStatus) %>% prop.table() * 100
5757

58+
table(ch$CompanyCategory)
59+
5860
#Then there's...
5961
#Which just means "no sig transactions in the last year"
6062
#https://www.gov.uk/dormant-company/dormant-for-companies-house
@@ -132,7 +134,7 @@ g(x)
132134

133135

134136

135-
#CHECK ON AVAILABLE DATA IN IXBRL FORMAT IN ACCOUNTS----
137+
# CHECK ON AVAILABLE DATA IN IXBRL FORMAT IN ACCOUNTS----
136138

137139
#Will vary, so need to extract from a few accounts and see what's there.
138140
#How many match what's listed in the URI guide here?
@@ -146,10 +148,37 @@ g(x)
146148

147149

148150

151+
# EXAMINE COMBINED LIVE LIST / ACCOUNT EXTRACTS FOR EMPLOYEE NUMBER----
152+
153+
both <- readRDS('local/accountextracts_n_livelist_geocoded_combined.rds')
154+
155+
#Count of firms per LA
156+
both %>%
157+
st_set_geometry(NULL) %>%
158+
group_by(localauthority_name) %>%
159+
summarise(n()) %>%
160+
View
161+
162+
#Join SIC lookup to first SIC code
163+
both <- both %>%
164+
mutate(
165+
SIC_5DIGIT_CODE = substr(SICCode.SicText_1,1,5)
166+
) %>%
167+
left_join(
168+
#SIC lookup table, read directly from the ukcompare repo on GitHub
read_csv('https://github.com/DanOlner/ukcompare/raw/master/data/SIClookup.csv'),
169+
by = 'SIC_5DIGIT_CODE'
170+
) %>%
171+
relocate(SIC_5DIGIT_NAME, .after = SIC_5DIGIT_CODE) %>%
172+
relocate(SIC_2DIGIT_CODE_NUMERIC, .after = SIC_2DIGIT_CODE)
149173

174+
#Drop all dormant companies (SIC code 99999). Row count goes from 3255134 to 3173230
175+
both <- both %>% filter(SIC_5DIGIT_CODE!="99999")
150176

151177

178+
#Let's make map of manufacturing
179+
#Make sure the QGIS output folder exists (recursive, in case 'local' itself is missing)
if (!dir.exists('local/qgis')) dir.create('local/qgis', recursive = TRUE)
152180

181+
#NOTE(review): unfinished call - st_write() needs the sf object to write plus a destination file
#(presumably somewhere under local/qgis); as written it only passes a directory path - TODO complete
st_write('local/')
153182

154183

155184

wrangling/combine_account_extracts_n_geocode.R

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
library(tidyverse)
33
source('functions.R')
44

5+
# GET COMPANIES HOUSE LIVE LIST----
6+
57
#current companies live list with geolocation / local authorities added
68
ch <- readRDS('local/companieshouse_livelist_geocoded.rds')
79
#pryr::object_size(ch) #3.4gb in memory
@@ -19,6 +21,122 @@ extract.locs <- list.files('local/account_extracts', full.names = T)
1921
extracts <- extract.locs %>% map(readRDS) %>% bind_rows
2022
#pryr::object_size(extracts) 732.65mb in memory
2123

24+
#Drop duplicate submitted accounts (they'll have the same date / be exactly the same)
25+
#Account code is the date of submission
26+
extracts <- extracts %>%
27+
#.keep_all = TRUE retains all other columns from the first row of each accountcode / company pair
distinct(accountcode, companynumber, .keep_all = TRUE)
28+
29+
#Format date
30+
extracts <- extracts %>%
31+
mutate(
32+
enddate_formatted = lubridate::parse_date_time(
33+
enddate,
34+
orders = c("dmy","ymd","mdy"))
35+
)
36+
37+
38+
39+
# GET AND PROCESS EXTRACTED ACCOUNTS DATA----
2240

2341
#For now, just going to keep the latest accounts
24-
#Firms with multiple accounts up to 8, have clearly submitted backdated ones this year
42+
#Firms with multiple accounts (up to 8) have clearly submitted backdated ones this year
43+
#See https://github.com/DanOlner/companieshouseopen/blob/f2d59df75617905b2522c9be483bfb4db4f1fc5d/testcode/tests_n_checks_companieshousedata.R#L74
44+
45+
#For firms with multiple accounts, keep most recent date
46+
# x <- extracts %>%
47+
# group_by(companynumber) %>%
48+
# filter(enddate_formatted == max(enddate_formatted)) %>%
49+
# ungroup()
50+
51+
#Version applying to all companies is very slow
52+
#Let's split and find newest date just for those with more than one set of accounts
53+
#(Having filtered out the odd multiple submission of the same accounts above)
54+
account.count <- extracts %>%
55+
group_by(companynumber) %>%
56+
summarise(count = n()) %>%
57+
ungroup()
58+
59+
table(account.count$count)
60+
61+
#Keep only firms with multiple accounts, to pick the most recent accounts date
62+
multiples <- extracts %>% filter(companynumber %in% account.count$companynumber[account.count$count != 1]) %>%
63+
arrange(companynumber)
64+
65+
mostrecent <- multiples %>%
66+
group_by(companynumber) %>%
67+
filter(enddate_formatted == max(enddate_formatted)) %>%
68+
ungroup()
69+
70+
#Tick, one row per company
71+
length(unique(mostrecent$companynumber))
72+
73+
#Some duplicates still
74+
#Presumably must be some firms that submitted accounts with the same date?
75+
# mostrecent.count <- mostrecent %>%
76+
# group_by(companynumber) %>%
77+
# summarise(count = n()) %>%
78+
# ungroup()
79+
#
80+
# table(mostrecent.count$count)
81+
#
82+
# mostrecent %>% filter(companynumber %in% mostrecent.count$companynumber[mostrecent.count$count > 1]) %>% View
83+
84+
#Ah, those duplicates are easy to filter out - the account code is the same... now done above, in the distinct() on accountcode / companynumber
85+
86+
87+
#combine again
88+
extracts.singleaccounts <- bind_rows(
89+
mostrecent,
90+
extracts %>% filter(companynumber %in% account.count$companynumber[account.count$count == 1])
91+
)
92+
93+
#Confirm, should be one row per company... TICK
94+
length(unique(extracts.singleaccounts$companynumber))
95+
96+
97+
#Only need the one date column now
98+
extracts.singleaccounts <- extracts.singleaccounts %>%
99+
select(-enddate) %>%
100+
rename(enddate = enddate_formatted)
101+
102+
103+
# CHECK MATCH BETWEEN LIVE LIST AND EXTRACTED ACCOUNTS----
104+
105+
#Check match on companies house number... 8.2% with no match in the live list
106+
table(extracts.singleaccounts$companynumber %in% ch$CompanyNumber)
107+
table(extracts.singleaccounts$companynumber %in% ch$CompanyNumber) %>% prop.table() * 100
108+
109+
#Look at the falses - what accounts are those? Many NI - postcodes are GB only so that makes sense
110+
#A lot of others are dissolved - no longer live
111+
extracts.singleaccounts %>% filter(!companynumber %in% ch$CompanyNumber) %>% View
112+
113+
114+
115+
116+
# LINK LIVE LIST AND ACCOUNTS EXTRACTS----
117+
118+
#Currently ~3.2 million businesses
119+
#Keep as sf object with point geometry for locations
120+
both <- ch %>%
121+
inner_join(
122+
extracts.singleaccounts,
123+
by = c('CompanyNumber' = 'companynumber')
124+
)
125+
126+
127+
128+
129+
130+
131+
132+
133+
134+
135+
#2.4gb
136+
pryr::object_size(both)
137+
138+
saveRDS(both, 'local/accountextracts_n_livelist_geocoded_combined.rds')
139+
140+
141+
142+

0 commit comments

Comments
 (0)