Skip to content

Commit 232b751

Browse files
authored
Merge pull request #1322 from metacpan/haarg/index-contrib-name-email
include name and email in contributor index
2 parents 2656fc0 + 70ca750 commit 232b751

File tree

6 files changed

+227
-85
lines changed

6 files changed

+227
-85
lines changed

es/contributor/mapping.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,16 @@
1111
"index": "not_analyzed",
1212
"type": "string"
1313
},
14+
"name": {
15+
"ignore_above": 2048,
16+
"index": "not_analyzed",
17+
"type": "string"
18+
},
19+
"email": {
20+
"ignore_above": 2048,
21+
"index": "not_analyzed",
22+
"type": "string"
23+
},
1424
"release_author": {
1525
"ignore_above": 2048,
1626
"index": "not_analyzed",

lib/MetaCPAN/Document/Contributor.pm

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ package MetaCPAN::Document::Contributor;
33
use MetaCPAN::Moose;
44

55
use ElasticSearchX::Model::Document;
6-
use MetaCPAN::Types::TypeTiny qw( Str );
6+
use MetaCPAN::Types::TypeTiny qw( ArrayRef Str );
77

88
has distribution => (
99
is => 'ro',
@@ -24,9 +24,18 @@ has release_name => (
2424
);
2525

2626
has pauseid => (
27-
is => 'ro',
28-
isa => Str,
29-
required => 1,
27+
is => 'ro',
28+
isa => Str,
29+
);
30+
31+
has name => (
32+
is => 'ro',
33+
isa => Str,
34+
);
35+
36+
has email => (
37+
is => 'ro',
38+
isa => ArrayRef [Str],
3039
);
3140

3241
__PACKAGE__->meta->make_immutable;

lib/MetaCPAN/Query/Contributor.pm

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,24 @@ sub find_release_contributors {
1313
my $query = +{
1414
bool => {
1515
must => [
16-
{ term => { release_author => $author } },
17-
{ term => { release_name => $name } },
16+
{ term => { release_author => $author } },
17+
{ term => { release_name => $name } },
18+
{ exists => { field => 'pauseid' } },
1819
]
1920
}
2021
};
2122

2223
my $res = $self->es->search(
2324
es_doc_path('contributor'),
2425
body => {
25-
query => $query,
26-
size => 999,
26+
query => $query,
27+
size => 999,
28+
_source => [ qw(
29+
distribution
30+
pauseid
31+
release_author
32+
release_name
33+
) ],
2734
}
2835
);
2936
hit_total($res) or return {};
@@ -40,11 +47,17 @@ sub find_author_contributions {
4047
my $res = $self->es->search(
4148
es_doc_path('contributor'),
4249
body => {
43-
query => $query,
44-
size => 999,
50+
query => $query,
51+
size => 999,
52+
_source => [ qw(
53+
distribution
54+
pauseid
55+
release_author
56+
release_name
57+
) ],
4558
}
4659
);
47-
$res->{hits}{total} or return {};
60+
hit_total($res) or return {};
4861

4962
return +{
5063
contributors => [ map { $_->{_source} } @{ $res->{hits}{hits} } ] };

lib/MetaCPAN/Script/Contributor.pm

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -80,32 +80,7 @@ sub run {
8080
? { range => { date => { gte => sprintf( 'now-%dd', $self->age ) } } }
8181
: return;
8282

83-
my $timeout = $self->all ? '720m' : '5m';
84-
85-
my $scroll = $self->es->scroll_helper(
86-
scroll => $timeout,
87-
es_doc_path('release'),
88-
body => {
89-
query => $query,
90-
size => 500,
91-
_source => [qw( author distribution name )],
92-
},
93-
);
94-
95-
my @data;
96-
97-
while ( my $r = $scroll->next ) {
98-
my $contrib_data = $self->get_cpan_author_contributors(
99-
$r->{_source}{author},
100-
$r->{_source}{name},
101-
$r->{_source}{distribution},
102-
);
103-
next unless is_arrayref($contrib_data);
104-
log_debug { 'adding release ' . $r->{_source}{name} };
105-
push @data => @{$contrib_data};
106-
}
107-
108-
$self->update_release_contirbutors( \@data, $timeout );
83+
$self->update_contributors($query);
10984
}
11085

11186
__PACKAGE__->meta->make_immutable;

lib/MetaCPAN/Script/Release.pm

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -383,9 +383,14 @@ sub import_archive {
383383
MetaCPAN::Script::Runner->run;
384384
}
385385

386-
my $contrib_data = $self->get_cpan_author_contributors( $document->author,
387-
$document->name, $document->distribution );
388-
$self->update_release_contirbutors($contrib_data);
386+
$self->update_contributors( {
387+
bool => {
388+
must => [
389+
{ term => { author => $document->author } },
390+
{ term => { name => $document->name } },
391+
],
392+
},
393+
} );
389394
}
390395

391396
sub detect_status {

lib/MetaCPAN/Script/Role/Contributor.pm

Lines changed: 175 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2,61 +2,191 @@ package MetaCPAN::Script::Role::Contributor;
22

33
use Moose::Role;
44

5+
use Log::Contextual qw( :log );
56
use MetaCPAN::ESConfig qw( es_doc_path );
6-
use MetaCPAN::Util qw( digest true false );
7+
use MetaCPAN::Util qw( true false );
78
use Ref::Util qw( is_arrayref );
89

9-
sub get_cpan_author_contributors {
10-
my ( $self, $author, $release, $distribution ) = @_;
11-
my @ret;
12-
my $es = $self->es;
13-
14-
my $type = $self->model->doc('release');
15-
my $data;
16-
eval {
17-
$data = $type->get_contributors( $author, $release );
18-
1;
19-
} or return [];
20-
21-
for my $d ( @{ $data->{contributors} } ) {
22-
next unless exists $d->{pauseid};
23-
24-
# skip existing records
25-
my $id = digest( $d->{pauseid}, $release );
26-
my $exists = $es->exists( es_doc_path('contributor'), id => $id, );
27-
next if $exists;
28-
29-
$d->{release_author} = $author;
30-
$d->{release_name} = $release;
31-
$d->{distribution} = $distribution;
32-
push @ret, $d;
10+
# Rebuild the contributor documents for every release matched by $query.
#
# $query is passed through unchanged as the query of a scrolled search over
# the release documents.  For each release found, the actions returned by
# release_contributor_update_actions() are queued on a bulk helper for the
# contributor documents; the helper is flushed once the scroll is exhausted.
sub update_contributors {
    my ( $self, $query ) = @_;

    # Only the fields needed to work out a release's contributors.
    my @release_fields = qw<
        name
        author
        distribution
        metadata.author
        metadata.x_contributors
    >;

    my $releases = $self->es->scroll_helper(
        es_doc_path('release'),
        body => {
            query   => $query,
            sort    => ['_doc'],
            _source => \@release_fields,
        },
    );

    my $bulk = $self->es->bulk_helper( es_doc_path('contributor') );

    while ( my $hit = $releases->next ) {
        my $source = $hit->{_source};
        log_debug { 'updating contributors for ' . $source->{name} };

        my $pending = $self->release_contributor_update_actions($source);
        for my $op ( @{$pending} ) {
            $bulk->add_action( %{$op} );
        }
    }

    $bulk->flush;
}
3741

38-
sub update_release_contirbutors {
39-
my ( $self, $data, $timeout ) = @_;
40-
return unless $data and is_arrayref($data);
41-
42-
my $bulk = $self->es->bulk_helper( es_doc_path('contributor'),
43-
timeout => $timeout || '5m', );
44-
45-
for my $d ( @{$data} ) {
46-
my $id = digest( $d->{pauseid}, $d->{release_name} );
47-
$bulk->update( {
48-
id => $id,
49-
doc => {
50-
pauseid => $d->{pauseid},
51-
release_name => $d->{release_name},
52-
release_author => $d->{release_author},
53-
distribution => $d->{distribution},
42+
# Work out the bulk actions that replace one release's contributor documents.
#
# $release is a release _source hash; `name`, `author` and `distribution` are
# read here and the whole hash is handed to get_contributors().
#
# Returns an array reference of single-key hashes: first one `delete` action
# per contributor document currently found for the release (at most 500 are
# requested), then one `create` action per contributor returned by
# get_contributors().
sub release_contributor_update_actions {
    my ( $self, $release ) = @_;

    my ( $release_name, $release_author ) = @{$release}{qw( name author )};

    # Look up the ids of the documents already stored for this release; the
    # document bodies themselves are not needed.
    my $existing = $self->es->search(
        es_doc_path('contributor'),
        body => {
            query => {
                bool => {
                    must => [
                        { term => { release_name   => $release_name } },
                        { term => { release_author => $release_author } },
                    ],
                }
            },
            sort    => ['_doc'],
            size    => 500,
            _source => false,
        },
    );

    my @deletes = map { +{ delete => { id => $_->{_id} } } }
        @{ $existing->{hits}{hits} };

    my @creates = map {
        my $contrib = $_;
        my %doc     = (
            release_name   => $release->{name},
            release_author => $release->{author},
            distribution   => $release->{distribution},
        );

        # Copy across only the contributor fields that have a value.
        for my $field (qw(pauseid name email)) {
            next unless defined $contrib->{$field};
            $doc{$field} = $contrib->{$field};
        }

        +{ create => { _source => \%doc } };
    } @{ $self->get_contributors($release) };

    return [ @deletes, @creates ];
}
80+
81+
# Build the list of contributors for one release.
#
# $release is a release _source hash; the keys read here are `author` and
# `metadata` (its `author` and `x_contributors` entries).
#
# Returns an array reference of hashes with the keys `name`, `email` (an array
# reference) and, when one could be determined, `pauseid`.  Entries that match
# the release author (by the author id or by one of the author's e-mail
# addresses) are left out, and repeated entries are merged into their first
# occurrence.  An empty array reference is returned when the release author's
# document cannot be fetched.
sub get_contributors {
    my ( $self, $release ) = @_;

    my $author_name = $release->{author};
    my $contribs    = $release->{metadata}{x_contributors} || [];
    my $authors     = $release->{metadata}{author}         || [];

    for ( \( $contribs, $authors ) ) {

        # If a sole contributor is a string upgrade it to an array...
        $$_ = [$$_]
            if !ref $$_;

        # but if it's any other kind of value don't die trying to parse it.
        $$_ = []
            unless Ref::Util::is_arrayref($$_);

        # Only plain strings can be parsed as "Name <email>"; undefined or
        # nested values would otherwise be stringified into bogus names.
        $$_ = [ grep { defined($_) && !ref($_) } @{$$_} ];
    }
    $authors = [ grep { $_ ne 'unknown' } @$authors ];

    my $author = eval {
        $self->es->get_source( es_doc_path('author'), id => $author_name );
    }
        or return [];

    # The stored e-mail may be a single value or a list, and may be missing.
    # Undefined values are dropped so they never end up as an empty-string
    # key in %seen, where a contributor with an empty name would be mistaken
    # for the release author.
    my $author_email  = $author->{email};
    my @author_emails = grep { defined $_ } (
        Ref::Util::is_arrayref($author_email)
        ? @{$author_email}
        : $author_email
    );

    my $author_info = {
        email => [ lc "$author_name\@cpan.org", @author_emails ],
        name  => $author_name,
    };

    # Maps every name and e-mail address met so far to its contributor
    # record.  Seeding it with the release author makes the author count as
    # already seen.
    my %seen = map { $_ => $author_info }
        ( @{ $author_info->{email} }, $author_info->{name}, );

    my @contribs = map {
        my $name = $_;
        my $email;

        # Split "Name <address>" into its two parts.
        if ( $name =~ s/\s*<([^<>]+@[^<>]+)>// ) {
            $email = $1;
        }
        my $info;
        my $dupe;
        if ( $email and $info = $seen{$email} ) {
            $dupe = 1;
        }
        elsif ( $info = $seen{$name} ) {
            $dupe = 1;
        }
        else {
            $info = {
                name  => $name,
                email => [],
            };
        }
        $seen{$name} ||= $info;
        if ($email) {

            # A duplicate may still bring a new address for a known record.
            push @{ $info->{email} }, $email
                unless grep { $_ eq $email } @{ $info->{email} };
            $seen{$email} ||= $info;
        }
        $dupe ? () : $info;
    } ( @$authors, @$contribs );

    # E-mail address => contributor records that listed it.
    my %want_email;
    for my $contrib (@contribs) {

        # heuristic to autofill pause accounts
        if ( !$contrib->{pauseid} ) {
            my ($pauseid)
                = map { /^(.*)\@cpan\.org$/ ? $1 : () }
                @{ $contrib->{email} };
            $contrib->{pauseid} = uc $pauseid
                if $pauseid;

        }

        push @{ $want_email{$_} }, $contrib for @{ $contrib->{email} };
    }

    if (%want_email) {

        # Look for author documents that list any of the collected addresses.
        # At most 100 hits are requested.
        my $check_author = $self->es->search(
            es_doc_path('author'),
            body => {
                query   => { terms => { email => [ sort keys %want_email ] } },
                _source => [ 'email', 'pauseid' ],
                size    => 100,
            },
        );

        for my $hit ( @{ $check_author->{hits}{hits} } ) {
            my $emails = $hit->{_source}{email};
            $emails = [$emails]
                if !ref $emails;
            my $pauseid = uc $hit->{_source}{pauseid};

            # A match here replaces any id guessed from a cpan.org address.
            for my $email (@$emails) {
                for my $contrib ( @{ $want_email{$email} } ) {
                    $contrib->{pauseid} = $pauseid;
                }
            }
        }
    }

    return \@contribs;
}
61191

62192
no Moose::Role;

0 commit comments

Comments
 (0)