Skip to content

Commit b980304

Browse files
authored
Merge pull request #1209 from flyinfish/discussions/1206
Add missing result in EvaluationResult
2 parents 271bd2c + f74c1b4 commit b980304

File tree

4 files changed

+87
-20
lines changed

4 files changed

+87
-20
lines changed

testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReport.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@ public double scoreForTag(String tag) {
5656
* @throws IOException if an error occurs while writing the report
5757
*/
5858
public void writeReport(File output) throws IOException {
59+
// Delegate to the two-argument overload with includeResult set to false.
writeReport(output, false);
60+
}
61+
62+
/**
63+
* Write the report to a file using the Markdown syntax.
64+
*
65+
* @param output the output file, must not be {@code null}
66+
* @param includeResult whether to include the expectedOutput and result of the evaluation in the report
67+
* @throws IOException if an error occurs while writing the report
68+
*/
69+
public void writeReport(File output, boolean includeResult) throws IOException {
5970
StringBuilder buffer = new StringBuilder();
6071
buffer.append("# Evaluation Report\n\n");
6172
buffer.append("**Global Score**: ").append(score).append("\n\n");
@@ -69,9 +80,14 @@ public void writeReport(File output) throws IOException {
6980
}
7081

7182
buffer.append("\n## Details\n\n");
83+
var detailHeader = includeResult ? "### " : "- ";
7284
for (Scorer.EvaluationResult<?> evaluation : evaluations) {
73-
buffer.append("- ").append(evaluation.sample().name()).append(": ")
85+
buffer.append(detailHeader).append(evaluation.sample().name()).append(": ")
7486
.append(evaluation.passed() ? "PASSED" : "FAILED").append("\n");
87+
if (includeResult) {
88+
buffer.append("#### Result\n").append(evaluation.result()).append("\n");
89+
buffer.append("#### Expected Output\n").append(evaluation.sample().expectedOutput()).append("\n");
90+
}
7591
}
7692

7793
Files.write(output.toPath(), buffer.toString().getBytes());

testing/scorer/scorer-core/src/main/java/io/quarkiverse/langchain4j/testing/scorer/Scorer.java

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,16 @@ public <T> EvaluationReport evaluate(Samples<T> samples, Function<Parameters, T>
3939
var response = execute(sample, function);
4040
LOG.infof("Evaluating sample `%s`", sample.name());
4141
for (EvaluationStrategy<T> strategy : strategies) {
42-
EvaluationResult<T> evaluation = new EvaluationResult<>(sample,
43-
strategy.evaluate(sample, response));
42+
EvaluationResult<T> evaluation = EvaluationResult.fromCompletedEvaluation(sample,
43+
response, strategy.evaluate(sample, response));
4444
LOG.infof("Evaluation of sample `%s` with strategy `%s`: %s", sample.name(),
4545
strategy.getClass().getSimpleName(),
4646
evaluation.passed() ? "OK" : "KO");
4747
evaluations.add(evaluation);
4848
}
4949
} catch (Throwable e) {
5050
LOG.errorf(e, "Failed to evaluate sample `%s`", sample.name());
51-
evaluations.add(new EvaluationResult<>(sample, false));
51+
evaluations.add(EvaluationResult.fromEvaluationThrowable(sample, e));
5252
} finally {
5353
latch.countDown();
5454
}
@@ -66,7 +66,14 @@ public void close() {
6666
executor.shutdown();
6767
}
6868

69-
public record EvaluationResult<T>(EvaluationSample<T> sample, boolean passed) {
69+
/**
 * Outcome of evaluating a single sample.
 *
 * @param sample the sample that was evaluated
 * @param result the value recorded as the evaluation result; {@code null} when the instance is
 *        created through {@link #fromEvaluationThrowable}
 * @param thrown the throwable recorded for the evaluation; {@code null} when the instance is
 *        created through {@link #fromCompletedEvaluation}
 * @param passed whether the evaluation passed
 * @param <T> the type of the result
 */
public record EvaluationResult<T>(EvaluationSample<T> sample, T result, Throwable thrown, boolean passed) {
70+
/**
 * Creates a result for an evaluation that ran to completion; {@code thrown} is set to {@code null}.
 *
 * @param sample the evaluated sample
 * @param result the value to record as the result
 * @param passed whether the evaluation passed
 * @return a new result carrying the given sample, result and verdict
 */
public static <T> EvaluationResult<T> fromCompletedEvaluation(EvaluationSample<T> sample, T result, boolean passed) {
71+
return new EvaluationResult<>(sample, result, null, passed);
72+
}
73+
74+
/**
 * Creates a result for an evaluation that ended with a throwable; {@code result} is set to
 * {@code null} and {@code passed} to {@code false}.
 *
 * @param sample the evaluated sample
 * @param thrown the throwable to record
 * @return a new, non-passing result carrying the given sample and throwable
 */
public static <T> EvaluationResult<T> fromEvaluationThrowable(EvaluationSample<T> sample, Throwable thrown) {
75+
return new EvaluationResult<>(sample, null, thrown, false);
76+
}
7077
}
7178

7279
private <T> T execute(EvaluationSample<T> sample, Function<Parameters, T> function) {

testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/EvaluationReportTest.java

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package io.quarkiverse.langchain4j.testing.scorer;
22

3-
import static org.assertj.core.api.Assertions.*;
3+
import static org.assertj.core.api.Assertions.assertThat;
44

55
import java.io.File;
66
import java.io.IOException;
@@ -14,12 +14,14 @@ class EvaluationReportTest {
1414
@Test
1515
void globalScoreShouldBeCorrect() {
1616
// Create mock evaluations.
17-
Scorer.EvaluationResult<String> result1 = new Scorer.EvaluationResult<>(
17+
Scorer.EvaluationResult<String> result1 = Scorer.EvaluationResult.fromCompletedEvaluation(
1818
new EvaluationSample<>("Sample1", new Parameters(), "expected", List.of("tag1")),
19+
"expected",
1920
true);
2021

21-
Scorer.EvaluationResult<String> result2 = new Scorer.EvaluationResult<>(
22+
Scorer.EvaluationResult<String> result2 = Scorer.EvaluationResult.fromCompletedEvaluation(
2223
new EvaluationSample<>("Sample2", new Parameters(), "expected", List.of("tag2")),
24+
"some-response",
2325
false);
2426

2527
EvaluationReport report = new EvaluationReport(List.of(result1, result2));
@@ -31,16 +33,19 @@ void globalScoreShouldBeCorrect() {
3133
@Test
3234
void scoreForTagShouldBeCorrect() {
3335
// Create mock evaluations.
34-
Scorer.EvaluationResult<String> result1 = new Scorer.EvaluationResult<>(
36+
Scorer.EvaluationResult<String> result1 = Scorer.EvaluationResult.fromCompletedEvaluation(
3537
new EvaluationSample<>("Sample1", new Parameters(), "expected", List.of("tag1")),
38+
"expected",
3639
true);
3740

38-
Scorer.EvaluationResult<String> result2 = new Scorer.EvaluationResult<>(
41+
Scorer.EvaluationResult<String> result2 = Scorer.EvaluationResult.fromCompletedEvaluation(
3942
new EvaluationSample<>("Sample2", new Parameters(), "expected", List.of("tag2")),
43+
"some-response",
4044
false);
4145

42-
Scorer.EvaluationResult<String> result3 = new Scorer.EvaluationResult<>(
46+
Scorer.EvaluationResult<String> result3 = Scorer.EvaluationResult.fromCompletedEvaluation(
4347
new EvaluationSample<>("Sample3", new Parameters(), "expected", List.of("tag1", "tag2")),
48+
"expected",
4449
true);
4550

4651
EvaluationReport report = new EvaluationReport(List.of(result1, result2, result3));
@@ -53,12 +58,14 @@ void scoreForTagShouldBeCorrect() {
5358
@Test
5459
void writeReportShouldGenerateMarkdownFile() throws IOException {
5560
// Create mock evaluations.
56-
Scorer.EvaluationResult<String> result1 = new Scorer.EvaluationResult<>(
61+
Scorer.EvaluationResult<String> result1 = Scorer.EvaluationResult.fromCompletedEvaluation(
5762
new EvaluationSample<>("Sample1", new Parameters(), "expected", List.of("tag1")),
63+
"expected",
5864
true);
5965

60-
Scorer.EvaluationResult<String> result2 = new Scorer.EvaluationResult<>(
66+
Scorer.EvaluationResult<String> result2 = Scorer.EvaluationResult.fromCompletedEvaluation(
6167
new EvaluationSample<>("Sample2", new Parameters(), "expected", List.of("tag2")),
68+
"some-response",
6269
false);
6370

6471
EvaluationReport report = new EvaluationReport(List.of(result1, result2));
@@ -79,4 +86,40 @@ void writeReportShouldGenerateMarkdownFile() throws IOException {
7986
assertThat(content).contains("- Sample1: PASSED");
8087
assertThat(content).contains("- Sample2: FAILED");
8188
}
89+
90+
// Verifies that writeReport(file, true) renders each evaluation as a "###" heading followed by
// "#### Result" and "#### Expected Output" sections, in addition to the global and per-tag scores.
// NOTE(review): the method name contains a typo ("Incuding" -> "Including").
@Test
91+
void writeReportShouldGenerateMarkdownFileIncudingExpectedOutputAndResult() throws IOException {
92+
// Create mock evaluations.
93+
// Sample1 passes: its recorded result equals its expected output.
Scorer.EvaluationResult<String> result1 = Scorer.EvaluationResult.fromCompletedEvaluation(
94+
new EvaluationSample<>("Sample1", new Parameters(), "expected1", List.of("tag1")),
95+
"expected1",
96+
true);
97+
98+
// Sample2 fails: its recorded result differs from its expected output.
Scorer.EvaluationResult<String> result2 = Scorer.EvaluationResult.fromCompletedEvaluation(
99+
new EvaluationSample<>("Sample2", new Parameters(), "expected2", List.of("tag2")),
100+
"some-response",
101+
false);
102+
103+
EvaluationReport report = new EvaluationReport(List.of(result1, result2));
104+
105+
// Write the report to a temporary file.
// NOTE(review): the temporary file is not deleted within this test.
106+
File tempFile = File.createTempFile("evaluation-report", ".md");
107+
report.writeReport(tempFile, true);
108+
109+
// Assertions
110+
assertThat(tempFile).exists();
111+
String content = Files.readString(tempFile.toPath());
112+
assertThat(content).contains("# Evaluation Report");
113+
assertThat(content).contains("**Global Score**: 50.0");
114+
assertThat(content).contains("## Score per tags");
115+
assertThat(content).contains("- **tag1**: 100.0");
116+
assertThat(content).contains("- **tag2**: 0.0");
117+
assertThat(content).contains("## Details");
118+
// With includeResult enabled, each sample is a heading rather than a "- " list item.
assertThat(content).contains("### Sample1: PASSED");
119+
assertThat(content).contains("#### Result\nexpected1");
120+
assertThat(content).contains("#### Expected Output\nexpected1");
121+
assertThat(content).contains("### Sample2: FAILED");
122+
assertThat(content).contains("#### Result\nsome-response");
123+
assertThat(content).contains("#### Expected Output\nexpected2");
124+
}
82125
}

testing/scorer/scorer-core/src/test/java/io/quarkiverse/langchain4j/testing/scorer/ScorerTest.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ void evaluateShouldReturnCorrectReport() {
2727
EvaluationSample<String> sample1 = new EvaluationSample<>(
2828
"Sample1",
2929
new Parameters().add(new Parameter.UnnamedParameter("param1")),
30-
"expected1",
30+
"expected1:param1",
3131
List.of("tag1", "tag2"));
3232

3333
EvaluationSample<String> sample2 = new EvaluationSample<>(
@@ -36,7 +36,7 @@ void evaluateShouldReturnCorrectReport() {
3636
"expected2",
3737
List.of("tag2"));
3838

39-
Function<Parameters, String> mockFunction = params -> "expected1";
39+
Function<Parameters, String> mockFunction = params -> "expected1:param1";
4040
EvaluationStrategy<String> strategy = (sample, actual) -> actual.equals(sample.expectedOutput());
4141

4242
Samples<String> samples = new Samples<>(sample1, sample2);
@@ -46,11 +46,12 @@ void evaluateShouldReturnCorrectReport() {
4646
assertThat(report.score()).isEqualTo(50.0); // Only one sample should pass.
4747
assertThat(report.evaluations()).hasSize(2);
4848

49-
Scorer.EvaluationResult<?> result1 = report.evaluations().get(0);
50-
assertThat(result1.passed()).isTrue();
51-
52-
Scorer.EvaluationResult<?> result2 = report.evaluations().get(1);
53-
assertThat(result2.passed()).isFalse();
49+
var actualEvaluations = report.evaluations().stream()
50+
.map(e -> "%s[%s;%s=%s]".formatted(e.sample().name(), e.sample().expectedOutput(), e.result(), e.passed()))
51+
.toList();
52+
assertThat(actualEvaluations).containsExactlyInAnyOrder(
53+
"Sample1[expected1:param1;expected1:param1=true]",
54+
"Sample2[expected2;expected1:param1=false]");
5455
}
5556

5657
@Test

0 commit comments

Comments
 (0)