Skip to content

Commit 70ca750

Browse files
committed
include name and email in contributor index
All contributors, including those that can't be mapped to a PAUSE author, should be included in the contributor index. This will allow the web endpoints to look up all contributor information via the index, rather than needing to recalculate it for each call. Adds the name and email fields to the contributor index, and populates them when calculating contributor data. The PAUSE id is no longer required, so it can no longer be used to build the id of the ES doc. Instead, we delete all of the existing contributor docs for a release before updating the data. Previously, the contributors script would skip records that already existed. Now, it will delete and recreate the data. For now, the release/contributors endpoint will still recalculate the data for each call. After this change is deployed and all of the data is updated, that endpoint can be updated to use the contributor index and be much simpler. This will also reduce the code shared between indexing and the API.
1 parent 48080b0 commit 70ca750

File tree

6 files changed

+227
-85
lines changed

6 files changed

+227
-85
lines changed

es/contributor/mapping.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,16 @@
1111
"index": "not_analyzed",
1212
"type": "string"
1313
},
14+
"name": {
15+
"ignore_above": 2048,
16+
"index": "not_analyzed",
17+
"type": "string"
18+
},
19+
"email": {
20+
"ignore_above": 2048,
21+
"index": "not_analyzed",
22+
"type": "string"
23+
},
1424
"release_author": {
1525
"ignore_above": 2048,
1626
"index": "not_analyzed",

lib/MetaCPAN/Document/Contributor.pm

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ package MetaCPAN::Document::Contributor;
33
use MetaCPAN::Moose;
44

55
use ElasticSearchX::Model::Document;
6-
use MetaCPAN::Types::TypeTiny qw( Str );
6+
use MetaCPAN::Types::TypeTiny qw( ArrayRef Str );
77

88
has distribution => (
99
is => 'ro',
@@ -24,9 +24,18 @@ has release_name => (
2424
);
2525

2626
has pauseid => (
27-
is => 'ro',
28-
isa => Str,
29-
required => 1,
27+
is => 'ro',
28+
isa => Str,
29+
);
30+
31+
has name => (
32+
is => 'ro',
33+
isa => Str,
34+
);
35+
36+
has email => (
37+
is => 'ro',
38+
isa => ArrayRef [Str],
3039
);
3140

3241
__PACKAGE__->meta->make_immutable;

lib/MetaCPAN/Query/Contributor.pm

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,24 @@ sub find_release_contributors {
1313
my $query = +{
1414
bool => {
1515
must => [
16-
{ term => { release_author => $author } },
17-
{ term => { release_name => $name } },
16+
{ term => { release_author => $author } },
17+
{ term => { release_name => $name } },
18+
{ exists => { field => 'pauseid' } },
1819
]
1920
}
2021
};
2122

2223
my $res = $self->es->search(
2324
es_doc_path('contributor'),
2425
body => {
25-
query => $query,
26-
size => 999,
26+
query => $query,
27+
size => 999,
28+
_source => [ qw(
29+
distribution
30+
pauseid
31+
release_author
32+
release_name
33+
) ],
2734
}
2835
);
2936
hit_total($res) or return {};
@@ -40,11 +47,17 @@ sub find_author_contributions {
4047
my $res = $self->es->search(
4148
es_doc_path('contributor'),
4249
body => {
43-
query => $query,
44-
size => 999,
50+
query => $query,
51+
size => 999,
52+
_source => [ qw(
53+
distribution
54+
pauseid
55+
release_author
56+
release_name
57+
) ],
4558
}
4659
);
47-
$res->{hits}{total} or return {};
60+
hit_total($res) or return {};
4861

4962
return +{
5063
contributors => [ map { $_->{_source} } @{ $res->{hits}{hits} } ] };

lib/MetaCPAN/Script/Contributor.pm

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -80,32 +80,7 @@ sub run {
8080
? { range => { date => { gte => sprintf( 'now-%dd', $self->age ) } } }
8181
: return;
8282

83-
my $timeout = $self->all ? '720m' : '5m';
84-
85-
my $scroll = $self->es->scroll_helper(
86-
scroll => $timeout,
87-
es_doc_path('release'),
88-
body => {
89-
query => $query,
90-
size => 500,
91-
_source => [qw( author distribution name )],
92-
},
93-
);
94-
95-
my @data;
96-
97-
while ( my $r = $scroll->next ) {
98-
my $contrib_data = $self->get_cpan_author_contributors(
99-
$r->{_source}{author},
100-
$r->{_source}{name},
101-
$r->{_source}{distribution},
102-
);
103-
next unless is_arrayref($contrib_data);
104-
log_debug { 'adding release ' . $r->{_source}{name} };
105-
push @data => @{$contrib_data};
106-
}
107-
108-
$self->update_release_contirbutors( \@data, $timeout );
83+
$self->update_contributors($query);
10984
}
11085

11186
__PACKAGE__->meta->make_immutable;

lib/MetaCPAN/Script/Release.pm

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -383,9 +383,14 @@ sub import_archive {
383383
MetaCPAN::Script::Runner->run;
384384
}
385385

386-
my $contrib_data = $self->get_cpan_author_contributors( $document->author,
387-
$document->name, $document->distribution );
388-
$self->update_release_contirbutors($contrib_data);
386+
$self->update_contributors( {
387+
bool => {
388+
must => [
389+
{ term => { author => $document->author } },
390+
{ term => { name => $document->name } },
391+
],
392+
},
393+
} );
389394
}
390395

391396
sub detect_status {

lib/MetaCPAN/Script/Role/Contributor.pm

Lines changed: 175 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2,61 +2,191 @@ package MetaCPAN::Script::Role::Contributor;
22

33
use Moose::Role;
44

5+
use Log::Contextual qw( :log );
56
use MetaCPAN::ESConfig qw( es_doc_path );
6-
use MetaCPAN::Util qw( digest true false );
7+
use MetaCPAN::Util qw( true false );
78
use Ref::Util qw( is_arrayref );
89

9-
sub get_cpan_author_contributors {
10-
my ( $self, $author, $release, $distribution ) = @_;
11-
my @ret;
12-
my $es = $self->es;
13-
14-
my $type = $self->model->doc('release');
15-
my $data;
16-
eval {
17-
$data = $type->get_contributors( $author, $release );
18-
1;
19-
} or return [];
20-
21-
for my $d ( @{ $data->{contributors} } ) {
22-
next unless exists $d->{pauseid};
23-
24-
# skip existing records
25-
my $id = digest( $d->{pauseid}, $release );
26-
my $exists = $es->exists( es_doc_path('contributor'), id => $id, );
27-
next if $exists;
28-
29-
$d->{release_author} = $author;
30-
$d->{release_name} = $release;
31-
$d->{distribution} = $distribution;
32-
push @ret, $d;
10+
sub update_contributors {
11+
my ( $self, $query ) = @_;
12+
13+
my $scroll = $self->es->scroll_helper(
14+
es_doc_path('release'),
15+
body => {
16+
query => $query,
17+
sort => ['_doc'],
18+
_source => [ qw<
19+
name
20+
author
21+
distribution
22+
metadata.author
23+
metadata.x_contributors
24+
> ],
25+
},
26+
);
27+
28+
my $bulk = $self->es->bulk_helper( es_doc_path('contributor') );
29+
30+
while ( my $release = $scroll->next ) {
31+
log_debug { 'updating contributors for ' . $release->{_source}{name} };
32+
my $actions = $self->release_contributor_update_actions(
33+
$release->{_source} );
34+
for my $action (@$actions) {
35+
$bulk->add_action(%$action);
36+
}
3337
}
3438

35-
return \@ret;
39+
$bulk->flush;
3640
}
3741

38-
sub update_release_contirbutors {
39-
my ( $self, $data, $timeout ) = @_;
40-
return unless $data and is_arrayref($data);
41-
42-
my $bulk = $self->es->bulk_helper( es_doc_path('contributor'),
43-
timeout => $timeout || '5m', );
44-
45-
for my $d ( @{$data} ) {
46-
my $id = digest( $d->{pauseid}, $d->{release_name} );
47-
$bulk->update( {
48-
id => $id,
49-
doc => {
50-
pauseid => $d->{pauseid},
51-
release_name => $d->{release_name},
52-
release_author => $d->{release_author},
53-
distribution => $d->{distribution},
42+
sub release_contributor_update_actions {
43+
my ( $self, $release ) = @_;
44+
my @actions;
45+
46+
my $res = $self->es->search(
47+
es_doc_path('contributor'),
48+
body => {
49+
query => {
50+
bool => {
51+
must => [
52+
{ term => { release_name => $release->{name} } },
53+
{ term => { release_author => $release->{author} } },
54+
],
55+
}
5456
},
55-
doc_as_upsert => true,
56-
} );
57+
sort => ['_doc'],
58+
size => 500,
59+
_source => false,
60+
},
61+
);
62+
my @ids = map $_->{_id}, @{ $res->{hits}{hits} };
63+
push @actions, map +{ delete => { id => $_ } }, @ids;
64+
65+
my $contribs = $self->get_contributors($release);
66+
my @docs = map {
67+
;
68+
my $contrib = $_;
69+
{
70+
release_name => $release->{name},
71+
release_author => $release->{author},
72+
distribution => $release->{distribution},
73+
map +( defined $contrib->{$_} ? ( $_ => $contrib->{$_} ) : () ),
74+
qw(pauseid name email)
75+
};
76+
} @$contribs;
77+
push @actions, map +{ create => { _source => $_ } }, @docs;
78+
return \@actions;
79+
}
80+
81+
sub get_contributors {
82+
my ( $self, $release ) = @_;
83+
84+
my $author_name = $release->{author};
85+
my $contribs = $release->{metadata}{x_contributors} || [];
86+
my $authors = $release->{metadata}{author} || [];
87+
88+
for ( \( $contribs, $authors ) ) {
89+
90+
# If a sole contributor is a string upgrade it to an array...
91+
$$_ = [$$_]
92+
if !ref $$_;
93+
94+
# but if it's any other kind of value don't die trying to parse it.
95+
$$_ = []
96+
unless Ref::Util::is_arrayref($$_);
5797
}
98+
$authors = [ grep { $_ ne 'unknown' } @$authors ];
5899

59-
$bulk->flush;
100+
my $author = eval {
101+
$self->es->get_source( es_doc_path('author'), id => $author_name );
102+
}
103+
or return [];
104+
105+
my $author_email = $author->{email};
106+
107+
my $author_info = {
108+
email => [
109+
lc "$author_name\@cpan.org",
110+
(
111+
Ref::Util::is_arrayref($author_email)
112+
? @{$author_email}
113+
: $author_email
114+
),
115+
],
116+
name => $author_name,
117+
};
118+
my %seen = map { $_ => $author_info }
119+
( @{ $author_info->{email} }, $author_info->{name}, );
120+
121+
my @contribs = map {
122+
my $name = $_;
123+
my $email;
124+
if ( $name =~ s/\s*<([^<>]+@[^<>]+)>// ) {
125+
$email = $1;
126+
}
127+
my $info;
128+
my $dupe;
129+
if ( $email and $info = $seen{$email} ) {
130+
$dupe = 1;
131+
}
132+
elsif ( $info = $seen{$name} ) {
133+
$dupe = 1;
134+
}
135+
else {
136+
$info = {
137+
name => $name,
138+
email => [],
139+
};
140+
}
141+
$seen{$name} ||= $info;
142+
if ($email) {
143+
push @{ $info->{email} }, $email
144+
unless grep { $_ eq $email } @{ $info->{email} };
145+
$seen{$email} ||= $info;
146+
}
147+
$dupe ? () : $info;
148+
} ( @$authors, @$contribs );
149+
150+
my %want_email;
151+
for my $contrib (@contribs) {
152+
153+
# heuristic to autofill pause accounts
154+
if ( !$contrib->{pauseid} ) {
155+
my ($pauseid)
156+
= map { /^(.*)\@cpan\.org$/ ? $1 : () }
157+
@{ $contrib->{email} };
158+
$contrib->{pauseid} = uc $pauseid
159+
if $pauseid;
160+
161+
}
162+
163+
push @{ $want_email{$_} }, $contrib for @{ $contrib->{email} };
164+
}
165+
166+
if (%want_email) {
167+
my $check_author = $self->es->search(
168+
es_doc_path('author'),
169+
body => {
170+
query => { terms => { email => [ sort keys %want_email ] } },
171+
_source => [ 'email', 'pauseid' ],
172+
size => 100,
173+
},
174+
);
175+
176+
for my $author ( @{ $check_author->{hits}{hits} } ) {
177+
my $emails = $author->{_source}{email};
178+
$emails = [$emails]
179+
if !ref $emails;
180+
my $pauseid = uc $author->{_source}{pauseid};
181+
for my $email (@$emails) {
182+
for my $contrib ( @{ $want_email{$email} } ) {
183+
$contrib->{pauseid} = $pauseid;
184+
}
185+
}
186+
}
187+
}
188+
189+
return \@contribs;
60190
}
61191

62192
no Moose::Role;

0 commit comments

Comments
 (0)