@@ -2,61 +2,191 @@ package MetaCPAN::Script::Role::Contributor;
2
2
3
3
use Moose::Role;
4
4
5
+ use Log::Contextual qw( :log ) ;
5
6
use MetaCPAN::ESConfig qw( es_doc_path ) ;
6
- use MetaCPAN::Util qw( digest true false ) ;
7
+ use MetaCPAN::Util qw( true false ) ;
7
8
use Ref::Util qw( is_arrayref ) ;
8
9
9
- sub get_cpan_author_contributors {
10
- my ( $self , $author , $release , $distribution ) = @_ ;
11
- my @ret ;
12
- my $es = $self -> es;
13
-
14
- my $type = $self -> model-> doc(' release' );
15
- my $data ;
16
- eval {
17
- $data = $type -> get_contributors( $author , $release );
18
- 1;
19
- } or return [];
20
-
21
- for my $d ( @{ $data -> {contributors } } ) {
22
- next unless exists $d -> {pauseid };
23
-
24
- # skip existing records
25
- my $id = digest( $d -> {pauseid }, $release );
26
- my $exists = $es -> exists ( es_doc_path(' contributor' ), id => $id , );
27
- next if $exists ;
28
-
29
- $d -> {release_author } = $author ;
30
- $d -> {release_name } = $release ;
31
- $d -> {distribution } = $distribution ;
32
- push @ret , $d ;
10
+ sub update_contributors {
11
+ my ( $self , $query ) = @_ ;
12
+
13
+ my $scroll = $self -> es-> scroll_helper(
14
+ es_doc_path(' release' ),
15
+ body => {
16
+ query => $query ,
17
+ sort => [' _doc' ],
18
+ _source => [ qw<
19
+ name
20
+ author
21
+ distribution
22
+ metadata.author
23
+ metadata.x_contributors
24
+ > ],
25
+ },
26
+ );
27
+
28
+ my $bulk = $self -> es-> bulk_helper( es_doc_path(' contributor' ) );
29
+
30
+ while ( my $release = $scroll -> next ) {
31
+ log_debug { ' updating contributors for ' . $release -> {_source }{name } };
32
+ my $actions = $self -> release_contributor_update_actions(
33
+ $release -> {_source } );
34
+ for my $action (@$actions ) {
35
+ $bulk -> add_action(%$action );
36
+ }
33
37
}
34
38
35
- return \ @ret ;
39
+ $bulk -> flush ;
36
40
}
37
41
38
- sub update_release_contirbutors {
39
- my ( $self , $data , $timeout ) = @_ ;
40
- return unless $data and is_arrayref($data );
41
-
42
- my $bulk = $self -> es-> bulk_helper( es_doc_path(' contributor' ),
43
- timeout => $timeout || ' 5m' , );
44
-
45
- for my $d ( @{$data } ) {
46
- my $id = digest( $d -> {pauseid }, $d -> {release_name } );
47
- $bulk -> update( {
48
- id => $id ,
49
- doc => {
50
- pauseid => $d -> {pauseid },
51
- release_name => $d -> {release_name },
52
- release_author => $d -> {release_author },
53
- distribution => $d -> {distribution },
42
+ sub release_contributor_update_actions {
43
+ my ( $self , $release ) = @_ ;
44
+ my @actions ;
45
+
46
+ my $res = $self -> es-> search(
47
+ es_doc_path(' contributor' ),
48
+ body => {
49
+ query => {
50
+ bool => {
51
+ must => [
52
+ { term => { release_name => $release -> {name } } },
53
+ { term => { release_author => $release -> {author } } },
54
+ ],
55
+ }
54
56
},
55
- doc_as_upsert => true,
56
- } );
57
+ sort => [' _doc' ],
58
+ size => 500,
59
+ _source => false,
60
+ },
61
+ );
62
+ my @ids = map $_ -> {_id }, @{ $res -> {hits }{hits } };
63
+ push @actions , map +{ delete => { id => $_ } }, @ids ;
64
+
65
+ my $contribs = $self -> get_contributors($release );
66
+ my @docs = map {
67
+ ;
68
+ my $contrib = $_ ;
69
+ {
70
+ release_name => $release -> {name },
71
+ release_author => $release -> {author },
72
+ distribution => $release -> {distribution },
73
+ map +( defined $contrib -> {$_ } ? ( $_ => $contrib -> {$_ } ) : () ),
74
+ qw( pauseid name email)
75
+ };
76
+ } @$contribs ;
77
+ push @actions , map +{ create => { _source => $_ } }, @docs ;
78
+ return \@actions ;
79
+ }
80
+
81
+ sub get_contributors {
82
+ my ( $self , $release ) = @_ ;
83
+
84
+ my $author_name = $release -> {author };
85
+ my $contribs = $release -> {metadata }{x_contributors } || [];
86
+ my $authors = $release -> {metadata }{author } || [];
87
+
88
+ for ( \( $contribs , $authors ) ) {
89
+
90
+ # If a sole contributor is a string upgrade it to an array...
91
+ $$_ = [$$_ ]
92
+ if !ref $$_ ;
93
+
94
+ # but if it's any other kind of value don't die trying to parse it.
95
+ $$_ = []
96
+ unless Ref::Util::is_arrayref($$_ );
57
97
}
98
+ $authors = [ grep { $_ ne ' unknown' } @$authors ];
58
99
59
- $bulk -> flush;
100
+ my $author = eval {
101
+ $self -> es-> get_source( es_doc_path(' author' ), id => $author_name );
102
+ }
103
+ or return [];
104
+
105
+ my $author_email = $author -> {email };
106
+
107
+ my $author_info = {
108
+ email => [
109
+ lc " $author_name \@ cpan.org" ,
110
+ (
111
+ Ref::Util::is_arrayref($author_email )
112
+ ? @{$author_email }
113
+ : $author_email
114
+ ),
115
+ ],
116
+ name => $author_name ,
117
+ };
118
+ my %seen = map { $_ => $author_info }
119
+ ( @{ $author_info -> {email } }, $author_info -> {name }, );
120
+
121
+ my @contribs = map {
122
+ my $name = $_ ;
123
+ my $email ;
124
+ if ( $name =~ s /\s *<([^<>]+@[^<>]+)>// ) {
125
+ $email = $1 ;
126
+ }
127
+ my $info ;
128
+ my $dupe ;
129
+ if ( $email and $info = $seen {$email } ) {
130
+ $dupe = 1;
131
+ }
132
+ elsif ( $info = $seen {$name } ) {
133
+ $dupe = 1;
134
+ }
135
+ else {
136
+ $info = {
137
+ name => $name ,
138
+ email => [],
139
+ };
140
+ }
141
+ $seen {$name } ||= $info ;
142
+ if ($email ) {
143
+ push @{ $info -> {email } }, $email
144
+ unless grep { $_ eq $email } @{ $info -> {email } };
145
+ $seen {$email } ||= $info ;
146
+ }
147
+ $dupe ? () : $info ;
148
+ } ( @$authors , @$contribs );
149
+
150
+ my %want_email ;
151
+ for my $contrib (@contribs ) {
152
+
153
+ # heuristic to autofill pause accounts
154
+ if ( !$contrib -> {pauseid } ) {
155
+ my ($pauseid )
156
+ = map { / ^(.*)\@ cpan\. org$ / ? $1 : () }
157
+ @{ $contrib -> {email } };
158
+ $contrib -> {pauseid } = uc $pauseid
159
+ if $pauseid ;
160
+
161
+ }
162
+
163
+ push @{ $want_email {$_ } }, $contrib for @{ $contrib -> {email } };
164
+ }
165
+
166
+ if (%want_email ) {
167
+ my $check_author = $self -> es-> search(
168
+ es_doc_path(' author' ),
169
+ body => {
170
+ query => { terms => { email => [ sort keys %want_email ] } },
171
+ _source => [ ' email' , ' pauseid' ],
172
+ size => 100,
173
+ },
174
+ );
175
+
176
+ for my $author ( @{ $check_author -> {hits }{hits } } ) {
177
+ my $emails = $author -> {_source }{email };
178
+ $emails = [$emails ]
179
+ if !ref $emails ;
180
+ my $pauseid = uc $author -> {_source }{pauseid };
181
+ for my $email (@$emails ) {
182
+ for my $contrib ( @{ $want_email {$email } } ) {
183
+ $contrib -> {pauseid } = $pauseid ;
184
+ }
185
+ }
186
+ }
187
+ }
188
+
189
+ return \@contribs ;
60
190
}
61
191
62
192
no Moose::Role;
0 commit comments