use strict;
use warnings;
use v5.36;

use feature qw< state >;
use Getopt::Long;
use MetaCPAN::Logger qw< :log :dlog >;
use Cpanel::JSON::XS qw< decode_json encode_json >;
use DateTime ();
use IO::Zlib ();
use Path::Tiny qw< path >;
use Try::Tiny qw< catch try >;

use MetaCPAN::ES;
use MetaCPAN::Ingest qw< home >;

# config

# args
my $batch_size = 100;
my $size       = 1000;
my $index      = "cpan";

my ( $type, $purge, $dry_run, $restore );
GetOptions(
    "batch_size=i" => \$batch_size,
    "purge"        => \$purge,
    "dry_run"      => \$dry_run,
    "size=i"       => \$size,
    "index=s"      => \$index,
    "type=s"       => \$type,
    "restore=s"    => \$restore,
);

# setup
my $home = path( home() );

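# dispatch: --restore and/or --purge run maintenance actions; with neither
# flag, fall through to taking a fresh backup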
run_restore() if $restore;
run_purge()   if $purge;
run_backup() unless $restore or $purge;

1;

###

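# run_restore: stream a gzipped backup file (one JSON document per line, as
# written by run_backup) and bulk-index every record back into Elasticsearch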
sub run_restore () {
    my $restore_path = path($restore);
    $restore_path->exists or die "$restore doesn't exist\n";

    log_info { 'Restoring from ', $restore_path };

    my $fh = IO::Zlib->new( $restore_path->stringify, 'rb' )
        or die "cannot open $restore_path for reading\n";

    my %es_store;
    my %bulk_store;

    while ( my $line = $fh->readline ) {

        state $line_count = 0;
        ++$line_count;
        my $raw;

        try { $raw = decode_json($line) }
        catch {
            my $err = $_;
            log_warn {"cannot decode JSON (line $line_count): $line --- $err"};
        };
        next unless $raw;    # skip lines we could not decode

        # Create our bulk helper if we need to. In case a backup has mixed
        # _index or _type values, keep a separate bulk helper for each pair.
        my $key = $raw->{_index} . $raw->{_type};

        $es_store{$key} ||= MetaCPAN::ES->new(
            index => $raw->{_index},
            type  => $raw->{_type},
        );
        my $es = $es_store{$key};

        $bulk_store{$key} ||= $es->bulk( max_count => $batch_size );
        my $bulk = $bulk_store{$key};

        my $parent = $raw->{fields}->{_parent};

        if ( $raw->{_type} eq 'author' ) {

            # Hack for dodgy lat / lon values; the location array is
            # stored in [ lon, lat ] order
            if ( my $loc = $raw->{_source}->{location} ) {

                my $lat = $loc->[1];
                my $lon = $loc->[0];

                if ( $lat > 90 or $lat < -90 ) {

                    # Invalid latitude
                    delete $raw->{_source}->{location};
                }
                elsif ( $lon > 180 or $lon < -180 ) {

                    # Invalid longitude
                    delete $raw->{_source}->{location};
                }
            }
        }

        # update documents that already exist; create the rest, preserving
        # any parent link captured in the backup
        if ( $es->exists( id => $raw->{_id} ) ) {

            $bulk->update( {
                id            => $raw->{_id},
                doc           => $raw->{_source},
                doc_as_upsert => 1,
            } );

        }
        else {

            $bulk->create( {
                id => $raw->{_id},
                $parent ? ( parent => $parent ) : (),
                source => $raw->{_source},
            } );
        }
    }

    # Flush anything left over, just in case, then refresh the indices so
    # the restored documents become searchable
    $_->flush         for values %bulk_store;
    $_->index_refresh for values %es_store;

    log_info {'done'};
}

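# run_purge: prune var/backup, keeping everything from the past seven days
# and, beyond that, only backups made on the first day of a week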
sub run_purge () {
    my $now    = DateTime->now;
    my $backup = $home->child(qw< var backup >);

    $backup->visit(
        sub {
            my $file = shift;
            return if $file->is_dir;

            my $mtime = DateTime->from_epoch( epoch => $file->stat->mtime );

            # keep a daily backup for one week
            return if $mtime > $now->clone->subtract( days => 7 );

            # after that, keep only weekly backups: remove files whose
            # mtime does not fall on the first day of their week
            if ( $mtime->clone->truncate( to => 'week' )
                != $mtime->clone->truncate( to => 'day' ) )
            {
                log_info {"Removing old backup $file"};
                return log_info {'Not removing (dry run)'} if $dry_run;
                $file->remove;
            }
        },
        { recurse => 1 }
    );
}

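# run_backup: scroll the chosen index (optionally a single type) and write
# each hit as one JSON line to a date-stamped .json.gz file under var/backup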
sub run_backup () {
    my $filename = join( '-',
        DateTime->now->strftime('%F'),
        grep {defined} $index, $type );

    my $file = $home->child( qw< var backup >, "$filename.json.gz" );
    $file->parent->mkpath unless -e $file->parent;
    my $fh = IO::Zlib->new( "$file", 'wb4' )
        or die "cannot open $file for writing\n";

    my $es = MetaCPAN::ES->new(
        index => $index,
        ( $type ? ( type => $type ) : () ),
    );
    my $scroll = $es->scroll(
        size   => $size,
        fields => [qw< _parent _source >],
        scroll => '1m',
    );

    log_info { 'Backing up ', $scroll->total, ' documents' };

    while ( my $result = $scroll->next ) {
        print $fh encode_json($result), $/;
    }

    close $fh;
    log_info {'done'};
}

__END__

=head1 NAME

MetaCPAN::Script::Backup - Backup indices and types

=head1 SYNOPSIS

 $ bin/backup --index user --type account

 $ bin/backup --purge

 $ bin/backup --restore path

=head1 DESCRIPTION

Creates C<.json.gz> files in C<var/backup>. These files contain
one record per line.
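
Each line is the JSON-encoded hit as returned by the Elasticsearch scroll.
The shape below is illustrative only, inferred from the fields that
C<run_restore> reads back:

 {"_id":"...","_index":"cpan","_type":"author","_source":{...}}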

=head2 purge

Purges old backups. Backups from the current week are kept.
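
=head2 restore

Restores records from the given C<.json.gz> file: documents that already
exist are updated in place, and missing documents are created, preserving
any parent relationship stored in the backup.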