Skip to content

Commit 5a1c2ff

Browse files
pokeysketch
andcommitted
Add llms.txt
- Fixes #2873 Co-Authored-By: sketch <hello@sketch.dev> Change-ID: s2e8abba011e787cbk
1 parent 97bf1d6 commit 5a1c2ff

File tree

4 files changed

+221
-105
lines changed

4 files changed

+221
-105
lines changed

packages/cursorless-org/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,16 @@ To learn more about Next.js, take a look at the following resources:
2020
- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
2121

2222
You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js/) - your feedback and contributions are welcome!
23+
24+
25+
## LLM Training Data
26+
27+
During the build process, an `llms.txt` file is automatically generated in the output directory. This file contains a concatenation of all markdown files from the documentation directory (`packages/cursorless-org-docs/src/docs/`) and is used for LLM training and reference.
28+
29+
You can find this file at `out/llms.txt` after running the build process.
30+
31+
To generate this file manually without running a full build, you can run:
32+
33+
```sh
34+
pnpm generate-llms
35+
```

packages/cursorless-org/package.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@
55
"private": true,
66
"scripts": {
77
"dev": "next dev",
8-
"build": "next build",
8+
"build": "next build && pnpm generate-llms",
99
"start": "http-server out -a 127.0.0.1 -p 8080",
1010
"lint": "next lint",
1111
"compile": "tsc --build",
1212
"watch": "tsc --build --watch",
13-
"clean": "rm -rf ./out tsconfig.tsbuildinfo ./dist ./build"
13+
"clean": "rm -rf ./out tsconfig.tsbuildinfo ./dist ./build",
14+
"generate-llms": "my-ts-node ./src/scripts/generateLlmsTxt.ts"
1415
},
1516
"dependencies": {
1617
"@cursorless/cheatsheet": "workspace:*",
18+
"@cursorless/common": "workspace:*",
1719
"@mdx-js/loader": "3.0.1",
1820
"@mdx-js/react": "3.0.1",
1921
"@next/mdx": "15.3.2",
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/usr/bin/env node
2+
3+
/**
4+
* Script to generate llms.txt by concatenating all markdown files in the docs directory.
5+
* The generated llms.txt file is used for LLM training and reference.
6+
*/
7+
8+
import fs from "fs/promises";
9+
import path from "path";
10+
import { fileURLToPath } from "url";
11+
12+
// Ambient declaration so the CommonJS `__dirname` fallback used below type-checks when this file is compiled as an ES module
13+
declare const __dirname: string;
14+
15+
/**
 * Recursively collect every Markdown (`.md` or `.mdx`) file below a directory.
 *
 * Entries are visited in name order (plain code-unit comparison) at every
 * level, so the returned list is the same on every platform and filesystem.
 *
 * @param dir The directory to search
 * @returns Paths of the matching files, each built by joining onto `dir`
 * @throws Whatever `fs.readdir` rejects with (for example when `dir` is missing)
 */
async function findMarkdownFiles(dir: string): Promise<string[]> {
  const entries = await fs.readdir(dir, { withFileTypes: true });

  // fs.readdir does not promise any particular ordering, so sort explicitly;
  // otherwise the generated output can differ between machines.
  const ordered = [...entries].sort((a, b) =>
    a.name < b.name ? -1 : a.name > b.name ? 1 : 0,
  );

  const nested = await Promise.all(
    ordered.map(async (entry): Promise<string[]> => {
      const entryPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        return findMarkdownFiles(entryPath);
      }
      const isMarkdown =
        entry.name.endsWith(".md") || entry.name.endsWith(".mdx");
      // Anything that is not a regular file (symlinks, sockets, ...) is skipped.
      return entry.isFile() && isMarkdown ? [entryPath] : [];
    }),
  );

  return nested.flat();
}
35+
36+
/**
 * Generate `llms.txt` by concatenating every Markdown file in the docs package.
 *
 * Input is `packages/cursorless-org-docs/src/docs` under the repo root. The
 * repo root comes from the `CURSORLESS_REPO_ROOT` environment variable when it
 * is set and non-empty, otherwise it is derived from this script's location.
 * Output is written to `out/llms.txt`, two directory levels above this script.
 *
 * Each file's content is preceded by an HTML comment naming its path relative
 * to the repo root. On any failure the error is logged and the process exits
 * with code 1, so the returned promise does not reject for those errors.
 */
export async function generateLlmsTxt(): Promise<void> {
  // Locate the directory containing this script.
  let scriptDir: string;
  try {
    // ESM approach
    scriptDir = path.dirname(fileURLToPath(import.meta.url));
  } catch {
    // Fallback for CommonJS
    scriptDir = __dirname;
  }

  // This script is run from packages/cursorless-org/src/scripts (per the
  // package's `generate-llms` script), so the repo root is four levels up.
  // `||` is deliberate: an empty CURSORLESS_REPO_ROOT also uses the fallback.
  const repoRoot =
    process.env.CURSORLESS_REPO_ROOT || path.resolve(scriptDir, "../../../../");

  // Path to docs directory and output file
  const docsDir = path.resolve(
    repoRoot,
    "packages/cursorless-org-docs/src/docs",
  );
  const outputFile = path.resolve(scriptDir, "../../out/llms.txt");

  console.log(`Generating llms.txt from markdown files in ${docsDir}`);

  try {
    // With `recursive: true`, an already-existing directory is not an error,
    // so no EEXIST handling is needed.
    await fs.mkdir(path.dirname(outputFile), { recursive: true });

    // Find all markdown files
    const markdownFiles = await findMarkdownFiles(docsDir);
    console.log(`Found ${markdownFiles.length} markdown files`);

    // Collect the header followed by one labelled section per file.
    const sections: string[] = [
      "# Cursorless Documentation\n\nThis file is auto-generated from all Markdown files in the Cursorless documentation.\n",
    ];

    for (const filePath of markdownFiles) {
      // Add file path as a comment for reference
      const relativePath = path.relative(repoRoot, filePath);
      sections.push(`\n\n<!-- File: ${relativePath} -->\n\n`);

      // Read and append the file content
      sections.push(await fs.readFile(filePath, "utf8"));
    }

    // Write the concatenated content to llms.txt
    await fs.writeFile(outputFile, sections.join("").trim());

    console.log(`Successfully generated llms.txt at ${outputFile}`);
  } catch (error) {
    console.error("Error generating llms.txt:", error);
    process.exit(1);
  }
}
94+
95+
// Start generation as soon as this module is loaded. The promise is
// intentionally not awaited: generateLlmsTxt logs failures and exits itself.
void generateLlmsTxt();

0 commit comments

Comments
 (0)