Skip to content

Commit 693cde3

Browse files
committed
vulnerability reference scraping and embeddings
1 parent 74948f9 commit 693cde3

File tree

37 files changed

+1824
-742
lines changed

37 files changed

+1824
-742
lines changed

lunatrace/bsl/docker-compose.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ services:
8787
- "host.docker.internal:host-gateway"
8888

8989
postgres:
90-
image: postgres:12
90+
image: ankane/pgvector:v0.4.0
9191
restart: always
9292
# Uncomment this if you want Postgres to log queries out.
9393
# command: ["postgres", "-c", "logging_collector=on", "-c", "log_directory=/tmp", "-c", "log_filename=postgresql.log", "-c", "log_statement=all"]

lunatrace/bsl/hasura/metadata/databases/lunatrace/tables/tables.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@
5353
- "!include vulnerability_equivalent.yaml"
5454
- "!include vulnerability_range.yaml"
5555
- "!include vulnerability_reference.yaml"
56+
- "!include vulnerability_reference_content.yaml"
57+
- "!include vulnerability_reference_embedding.yaml"
5658
- "!include vulnerability_severity.yaml"
5759
- "!include vulnerability_vulnerability.yaml"
5860
- "!include vulnerability_vulnerability_cwe.yaml"
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Hasura metadata for the vulnerability.reference_content table.
table:
2+
name: reference_content
3+
schema: vulnerability
4+
# Many-to-one link to the parent row, resolved through this table's reference_id foreign key.
object_relationships:
5+
- name: reference
6+
using:
7+
foreign_key_constraint_on: reference_id
8+
# One-to-many link to vulnerability.reference_embedding rows whose
# reference_content_id foreign key points back at this table.
array_relationships:
9+
- name: reference_embeddings
10+
using:
11+
foreign_key_constraint_on:
12+
column: reference_content_id
13+
table:
14+
name: reference_embedding
15+
schema: vulnerability
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Hasura metadata for the vulnerability.reference_embedding table.
table:
2+
name: reference_embedding
3+
schema: vulnerability
4+
# Many-to-one link to the owning reference_content row, resolved through
# this table's reference_content_id foreign key.
object_relationships:
5+
- name: reference_content
6+
using:
7+
foreign_key_constraint_on: reference_content_id

lunatrace/bsl/hasura/metadata/remote_schemas.yaml

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -13,81 +13,81 @@
1313
- role: user
1414
definition:
1515
schema: |
16+
scalar JSON
17+
scalar UUID
1618
type AuthenticatedRepoCloneUrlOutput {
17-
url: String
19+
url: String
1820
}
19-
scalar JSON
2021
type Mutation {
21-
presignManifestUpload(project_id: UUID!): PresignedUrlResponse
22+
presignManifestUpload(project_id: UUID!): PresignedUrlResponse
2223
}
2324
type PresignedUrlResponse {
24-
bucket: String!
25-
headers: JSON!
26-
key: String!
27-
url: String!
25+
bucket: String!
26+
headers: JSON!
27+
key: String!
28+
url: String!
2829
}
2930
type Query {
30-
authenticatedRepoCloneUrl(repoGithubId: Int!): AuthenticatedRepoCloneUrlOutput
31-
fakeQueryToHackHasuraBeingABuggyMess: String
32-
sbomUrl(buildId: UUID!): String
31+
authenticatedRepoCloneUrl(repoGithubId: Int!): AuthenticatedRepoCloneUrlOutput
32+
fakeQueryToHackHasuraBeingABuggyMess: String
33+
sbomUrl(buildId: UUID!): String
3334
}
3435
type SbomUploadUrlOutput {
35-
error: Boolean!
36-
uploadUrl: UploadUrl
36+
error: Boolean!
37+
uploadUrl: UploadUrl
3738
}
38-
scalar UUID
3939
type UploadUrl {
40-
headers: JSON!
41-
url: String!
40+
headers: JSON!
41+
url: String!
4242
}
4343
- role: service
4444
definition:
4545
schema: |
46+
scalar JSON
47+
scalar UUID
4648
type AuthenticatedRepoCloneUrlOutput {
47-
url: String
49+
url: String
4850
}
49-
scalar JSON
5051
type Mutation {
51-
presignManifestUpload(project_id: UUID!): PresignedUrlResponse
52+
presignManifestUpload(project_id: UUID!): PresignedUrlResponse
5253
}
5354
type PresignedUrlResponse {
54-
bucket: String!
55-
headers: JSON!
56-
key: String!
57-
url: String!
55+
bucket: String!
56+
headers: JSON!
57+
key: String!
58+
url: String!
5859
}
5960
type Query {
60-
authenticatedRepoCloneUrl(repoGithubId: Int!): AuthenticatedRepoCloneUrlOutput
61-
fakeQueryToHackHasuraBeingABuggyMess: String
62-
presignSbomUpload(orgId: UUID!, buildId: UUID!): SbomUploadUrlOutput
63-
sbomUrl(buildId: UUID!): String
64-
}
65-
input SbomUploadUrlInput {
66-
orgId: UUID!
67-
projectId: UUID!
61+
authenticatedRepoCloneUrl(repoGithubId: Int!): AuthenticatedRepoCloneUrlOutput
62+
fakeQueryToHackHasuraBeingABuggyMess: String
63+
presignSbomUpload(orgId: UUID!, buildId: UUID!): SbomUploadUrlOutput
64+
sbomUrl(buildId: UUID!): String
6865
}
6966
type SbomUploadUrlOutput {
70-
error: Boolean!
71-
uploadUrl: UploadUrl
67+
error: Boolean!
68+
uploadUrl: UploadUrl
7269
}
73-
scalar UUID
7470
type UploadUrl {
75-
headers: JSON!
76-
url: String!
71+
headers: JSON!
72+
url: String!
73+
}
74+
input SbomUploadUrlInput {
75+
orgId: UUID!
76+
projectId: UUID!
7777
}
7878
- role: cli
7979
definition:
8080
schema: |
8181
scalar JSON
82+
scalar UUID
8283
type Query {
83-
presignSbomUpload(orgId: UUID!, buildId: UUID!): SbomUploadUrlOutput
84+
presignSbomUpload(orgId: UUID!, buildId: UUID!): SbomUploadUrlOutput
8485
}
8586
type SbomUploadUrlOutput {
86-
error: Boolean!
87-
uploadUrl: UploadUrl
87+
error: Boolean!
88+
uploadUrl: UploadUrl
8889
}
89-
scalar UUID
9090
type UploadUrl {
91-
headers: JSON!
92-
url: String!
91+
headers: JSON!
92+
url: String!
9393
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-- Rollback for the reference content / embedding migration.
-- Objects are dropped in reverse dependency order: the function first, then
-- reference_embedding (which references reference_content), then
-- reference_content, and finally the vector extension that supplies the
-- embedding column type. IF EXISTS keeps the rollback re-runnable when some
-- of these objects are already gone.
DROP FUNCTION IF EXISTS vulnerability.match_reference_embedding_for_vulnerability;
2+
DROP TABLE IF EXISTS "vulnerability"."reference_embedding";
3+
DROP TABLE IF EXISTS "vulnerability"."reference_content";
4+
DROP EXTENSION IF EXISTS vector;
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
-- Provides the vector column type and the ivfflat index method used below.
-- IF NOT EXISTS lets the migration succeed on databases where the extension
-- has already been installed.
CREATE EXTENSION IF NOT EXISTS vector;
2+
3+
-- Content stored for a vulnerability reference (presumably the scraped page
-- body, per the commit title -- confirm against the scraper).
-- UNIQUE (reference_id) allows at most one content row per reference, and the
-- foreign key cascades updates/deletes from vulnerability.reference.
-- last_successful_fetch is nullable and defaults to NULL.
CREATE TABLE "vulnerability"."reference_content" (
4+
"id" uuid NOT NULL DEFAULT gen_random_uuid(),
5+
"reference_id" uuid NOT NULL REFERENCES "vulnerability"."reference"("id") ON UPDATE cascade ON DELETE cascade,
6+
"title" text NOT NULL,
7+
"content" text NOT NULL,
8+
"normalized_content" text NOT NULL,
9+
"content_type" text NOT NULL,
10+
"last_successful_fetch" timestamptz DEFAULT NULL,
11+
PRIMARY KEY ("id"),
12+
UNIQUE ("reference_id")
13+
);
14+
15+
-- Embedding rows attached to a reference_content row. Each row stores a text
-- value ("content") together with its 1536-dimension embedding vector.
-- The foreign key cascades updates/deletes from reference_content.
-- NOTE(review): content_hash is unique across the whole table, so identical
-- content under two different references cannot both be stored -- confirm
-- this is intended.
CREATE TABLE "vulnerability"."reference_embedding" (
16+
"id" uuid NOT NULL DEFAULT gen_random_uuid(),
17+
"content_hash" text NOT NULL,
18+
"reference_content_id" uuid NOT NULL REFERENCES "vulnerability"."reference_content"("id") ON UPDATE cascade ON DELETE cascade,
19+
"content" text NOT NULL,
20+
"embedding" vector (1536) NOT NULL,
21+
PRIMARY KEY ("id"),
22+
UNIQUE ("content_hash")
23+
);
24+
25+
-- ivfflat index over the embedding column using the cosine operator class,
-- built with 100 lists; it serves the `<=>` ordering used by
-- match_reference_embedding_for_vulnerability.
-- NOTE(review): the index is created while the table is still empty; the
-- pgvector documentation advises building ivfflat indexes after data is
-- loaded -- consider reindexing once embeddings are populated.
CREATE INDEX ON "vulnerability"."reference_embedding"
26+
USING ivfflat (embedding vector_cosine_ops)
27+
WITH (lists = 100);
28+
29+
-- Finds the stored embeddings most similar to query_embedding among the
-- references of a single vulnerability.
--
-- Parameters:
--   query_embedding       1536-dimension vector to compare against.
--   vuln_id               matched against vulnerability.vulnerability.source_id.
--   similarity_threshold  rows are kept only when similarity is strictly greater.
--   match_count           maximum number of rows returned.
--
-- Returns one row per matching embedding: the reference's id and url, the
-- embedding row's content, and similarity computed as
-- 1 - (embedding <=> query_embedding). Rows are ordered by
-- `embedding <=> query_embedding` ascending, i.e. most similar first.
create or replace function vulnerability.match_reference_embedding_for_vulnerability (
30+
query_embedding vector(1536),
31+
vuln_id text,
32+
similarity_threshold float,
33+
match_count int
34+
)
35+
returns table (
36+
id uuid,
37+
url text,
38+
content text,
39+
similarity float
40+
)
41+
language plpgsql
42+
as $$
43+
begin
44+
return query
45+
select
46+
r.id,
47+
r.url,
48+
re.content,
49+
1 - (re.embedding <=> query_embedding) as similarity
50+
from vulnerability.reference_embedding re
51+
join vulnerability.reference_content rc on rc.id = re.reference_content_id
52+
join vulnerability.reference r on r.id = rc.reference_id
53+
join vulnerability.vulnerability v on v.id = r.vulnerability_id
54+
where 1 - (re.embedding <=> query_embedding) > similarity_threshold
55+
and v.source_id = vuln_id
56+
order by re.embedding <=> query_embedding
57+
limit match_count;
58+
end;
59+
$$;

lunatrace/bsl/ingest-worker/cmd/ingestworker/main.go

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,13 @@ import (
3535
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/metadata/registry"
3636
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/metadata/replicator"
3737
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/metadata/replicator/npm"
38+
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/ml"
3839
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/openaifx"
39-
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/pineconefx"
4040
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/scanner/licensecheck"
4141
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/scanner/packagejson"
42+
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/vulnbot"
4243
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/vulnerability/affected"
43-
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/vulnerability/process"
44+
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/vulnerability/scrape"
4445

4546
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/cmd/ingestworker/license"
4647
"github.com/lunasec-io/lunasec/lunatrace/bsl/ingest-worker/pkg/metadata/ingester"
@@ -49,7 +50,11 @@ import (
4950

5051
func main() {
5152
// TODO (cthompson) this should be configured with an fx module
52-
log.Logger = zerolog.New(os.Stderr).With().Timestamp().Logger()
53+
logLevel := zerolog.InfoLevel
54+
if os.Getenv("LOG_LEVEL") == "debug" {
55+
logLevel = zerolog.DebugLevel
56+
}
57+
log.Logger = zerolog.New(os.Stderr).With().Timestamp().Logger().Level(logLevel)
5358

5459
clifx.Main(
5560
// TODO (cthompson) move this into an fx module
@@ -60,13 +65,19 @@ func main() {
6065
registry.NPMModule,
6166
ingester.Module,
6267
openaifx.Module,
63-
pineconefx.Module,
68+
scrape.Module,
6469

6570
fx.Provide(
71+
ml.NewService,
6672
cwe2.NewCWEIngester,
6773
epss2.NewEPSSIngester,
6874
cisa2.NewCISAKnownVulnIngester,
69-
process.NewProcessor,
75+
vulnbot.NewVulnBot,
76+
),
77+
78+
fx.Provide(
79+
vulnmanager.NewProcessor,
80+
//metadata.NewProcessor,
7081
),
7182

7283
// todo make a module
@@ -96,8 +107,6 @@ func main() {
96107
cwe.NewCommand,
97108
epss.NewCommand,
98109
cisa.NewCommand,
99-
),
100-
fx.Provide(
101110
packageCommand.NewCommand,
102111
),
103112
)

lunatrace/bsl/ingest-worker/cmd/ingestworker/package/package.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ type Params struct {
3030
Ingester metadata.PackageIngester
3131
Replicator metadata.Replicator
3232
APIReplicator metadata.APIReplicator
33+
//Processor metadata.Processor
3334
}
3435

3536
func NewCommand(p Params) clifx.CommandResult {
@@ -56,6 +57,11 @@ func NewCommand(p Params) clifx.CommandResult {
5657
Required: false,
5758
Usage: "If a package ingestion fails, continue without fatally failing.",
5859
},
60+
&cli.BoolFlag{
61+
Name: "references",
62+
Required: false,
63+
Usage: "Only ingest package references.",
64+
},
5965
&cli.DurationFlag{
6066
Name: "refetch-duration",
6167
Required: false,
@@ -66,12 +72,13 @@ func NewCommand(p Params) clifx.CommandResult {
6672
packageName := ctx.Args().First()
6773
registry := ctx.Bool("registry")
6874
ignoreErrors := ctx.Bool("ignore-errors")
75+
references := ctx.Bool("references")
6976
packagesFile := ctx.String("packages")
7077
refetchDuration := ctx.Duration("refetch-duration")
7178

7279
// import packages from a file
7380
if packagesFile != "" {
74-
return p.Ingester.IngestPackagesFromFile(ctx.Context, packagesFile, ignoreErrors, refetchDuration)
81+
return p.Ingester.IngestPackagesFromFile(ctx.Context, packagesFile, references)
7582
}
7683

7784
if registry {
@@ -82,10 +89,17 @@ func NewCommand(p Params) clifx.CommandResult {
8289
err := errors.New("no package name provided")
8390
return err
8491
}
85-
86-
return p.Ingester.IngestPackageAndDependencies(ctx.Context, packageName, ignoreErrors, refetchDuration)
92+
return p.Ingester.IngestWithDownloadCounts(ctx.Context, packageName)
8793
},
8894
},
95+
//{
96+
// Name: "embedding",
97+
// Flags: []cli.Flag{},
98+
// Action: func(ctx *cli.Context) error {
99+
// _ = ctx.Args().First()
100+
// return p.Processor.GenerateEmbeddingsForReferences()
101+
// },
102+
//},
89103
{
90104
Name: "replicate",
91105
Subcommands: []*cli.Command{

0 commit comments

Comments
 (0)