Skip to content

Commit 5e119d8

Browse files
committed
Added backup script
1 parent 9a86785 commit 5e119d8

File tree

3 files changed

+228
-1
lines changed

3 files changed

+228
-1
lines changed

bin/backup.pl

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
use strict;
2+
use warnings;
3+
use v5.36;
4+
5+
use feature qw< state >;
6+
use Getopt::Long;
7+
use MetaCPAN::Logger qw< :log :dlog >;
8+
use Cpanel::JSON::XS qw< decode_json encode_json >;
9+
use DateTime ();
10+
use IO::Zlib ();
11+
use Path::Tiny qw< path >;
12+
use Try::Tiny qw< catch try >;
13+
14+
use MetaCPAN::ES;
15+
use MetaCPAN::Ingest qw< home >;
16+
17+
# config
18+
19+
# args
20+
my $batch_size = 100;
21+
my $size = 1000;
22+
my $index = "cpan";
23+
24+
my ( $type, $purge, $dry_run, $restore );
25+
GetOptions(
26+
"batch_size=i" => \$batch_size,
27+
"purge" => \$purge,
28+
"dry_run" => \$dry_run,
29+
"size=i" => \$size,
30+
"index=s" => \$index,
31+
"type=s" => \$type,
32+
"restore=s" => \$restore,
33+
);
34+
35+
# setup
36+
my $home = path( home() );
37+
38+
run_restore() if $restore;
39+
run_purge() if $purge;
40+
run_backup() unless $restore or $purge;
41+
42+
1;
43+
44+
###
45+
46+
sub run_restore () {
47+
my $restore_path;
48+
$restore_path = path($restore);
49+
$restore_path->exists or die "$restore doesn't exist\n";
50+
51+
log_info { 'Restoring from ', $restore_path };
52+
53+
my @bulk;
54+
55+
my $fh = IO::Zlib->new( $restore_path->stringify, 'rb' );
56+
57+
my %es_store;
58+
my %bulk_store;
59+
60+
while ( my $line = $fh->readline ) {
61+
62+
state $line_count = 0;
63+
++$line_count;
64+
my $raw;
65+
66+
try { $raw = decode_json($line) }
67+
catch {
68+
log_warn {"cannot decode JSON: $line --- $&"};
69+
};
70+
71+
# Create our bulk_helper if we need,
72+
# incase a backup has mixed _index or _type
73+
# create a new bulk helper for each
74+
my $key = $raw->{_index} . $raw->{_type};
75+
76+
$es_store{$key} ||= MetaCPAN::ES->new(
77+
index => $raw->{_index},
78+
type => $raw->{_type},
79+
);
80+
my $es = $es_store{$key};
81+
82+
$bulk_store{$key} ||= $es->bulk( max_count => $batch_size );
83+
my $bulk = $bulk_store{$key};
84+
85+
my $parent = $raw->{fields}->{_parent};
86+
87+
if ( $raw->{_type} eq 'author' ) {
88+
89+
# Hack for dodgy lat / lon's
90+
if ( my $loc = $raw->{_source}->{location} ) {
91+
92+
my $lat = $loc->[1];
93+
my $lon = $loc->[0];
94+
95+
if ( $lat > 90 or $lat < -90 ) {
96+
97+
# Invalid latitude
98+
delete $raw->{_source}->{location};
99+
}
100+
elsif ( $lon > 180 or $lon < -180 ) {
101+
102+
# Invalid longitude
103+
delete $raw->{_source}->{location};
104+
}
105+
}
106+
}
107+
108+
if ( $es->exists( id => $raw->{_id} ) ) {
109+
110+
$bulk->update( {
111+
id => $raw->{_id},
112+
doc => $raw->{_source},
113+
doc_as_upsert => 1,
114+
} );
115+
116+
}
117+
else {
118+
119+
$bulk->create( {
120+
id => $raw->{_id},
121+
$parent ? ( parent => $parent ) : (),
122+
source => $raw->{_source},
123+
} );
124+
}
125+
}
126+
127+
# Flush anything left over just incase
128+
$_->index_refresh for values %es_store;
129+
$_->flush for values %bulk_store;
130+
131+
log_info {'done'};
132+
}
133+
134+
sub run_purge () {
135+
my $now = DateTime->now;
136+
my $backup = $home->child(qw< var backup >);
137+
138+
$backup->visit(
139+
sub {
140+
my $file = shift;
141+
return if $file->is_dir;
142+
143+
my $mtime = DateTime->from_epoch( epoch => $file->stat->mtime );
144+
145+
# keep a daily backup for one week
146+
return if $mtime > $now->clone->subtract( days => 7 );
147+
148+
# after that keep weekly backups
149+
if ( $mtime->clone->truncate( to => 'week' )
150+
!= $mtime->clone->truncate( to => 'day' ) )
151+
{
152+
log_info {"Removing old backup $file"};
153+
return log_info {'Not (dry run)'} if $dry_run;
154+
$file->remove;
155+
}
156+
},
157+
{ recurse => 1 }
158+
);
159+
}
160+
161+
sub run_backup {
162+
my $filename = join( '-',
163+
DateTime->now->strftime('%F'),
164+
grep {defined} $index, $type );
165+
166+
my $file = $home->child( qw< var backup >, "$filename.json.gz" );
167+
$file->parent->mkpath unless ( -e $file->parent );
168+
my $fh = IO::Zlib->new( "$file", 'wb4' );
169+
170+
my $es = MetaCPAN::ES->new(
171+
index => $index,
172+
( $type ? ( type => $type ) : () )
173+
);
174+
my $scroll = $es->scroll(
175+
size => $size,
176+
fields => [qw< _parent _source >],
177+
scroll => '1m',
178+
);
179+
180+
log_info { 'Backing up ', $scroll->total, ' documents' };
181+
182+
while ( my $result = $scroll->next ) {
183+
print $fh encode_json($result), $/;
184+
}
185+
186+
close $fh;
187+
log_info {'done'};
188+
}
189+
190+
__END__
191+
192+
=head1 NAME
193+
194+
MetaCPAN::Script::Backup - Backup indices and types
195+
196+
=head1 SYNOPSIS
197+
198+
$ bin/backup --index user --type account
199+
200+
$ bin/backup --purge
201+
202+
$ bin/backup --restore path
203+
204+
=head1 DESCRIPTION
205+
206+
Creates C<.json.gz> files in C<var/backup>. These files contain
207+
one record per line.
208+
209+
=head2 purge
210+
211+
Purges old backups. Backups from the current week are kept.

lib/MetaCPAN/ES.pm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ sub new ( $class, %args ) {
2222
nodes => [$node],
2323
),
2424
index => $index,
25-
type => $args{type},
25+
( $args{type} ? ( type => $args{type} ) : () ),
2626
}, $class;
2727
}
2828

lib/MetaCPAN/Ingest.pm

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use v5.36;
66

77
use Digest::SHA;
88
use Encode qw< decode_utf8 >;
9+
use IPC::Run3 ();
910
use LWP::UserAgent;
1011
use Path::Tiny qw< path >;
1112
use PAUSE::Permissions ();
@@ -27,6 +28,7 @@ use Sub::Exporter -setup => {
2728
extract_section
2829
fix_version
2930
handle_error
31+
home
3032
minion
3133
numify_version
3234
read_00whois
@@ -137,6 +139,20 @@ sub handle_error ( $error, $die_always ) {
137139
Carp::croak $error if $die_always;
138140
}
139141

142+
sub home () {
143+
IPC::Run3::run3(
144+
[ qw< git rev-parse --show-toplevel > ], # TODO: use alternative persistent path that's accessible from the container
145+
\undef, \my $stdout, \my $stderr
146+
);
147+
148+
die $stderr if ($?);
149+
150+
chomp $stdout;
151+
die "Failed to find git dir: '$stdout'" unless -d $stdout;
152+
153+
return $stdout;
154+
}
155+
140156
sub minion () {
141157
require 'Mojo::Server';
142158
return Mojo::Server->new->build_app('MetaCPAN::API')->minion;

0 commit comments

Comments
 (0)