diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 00000000..85429d0c --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,13 @@ +{ + "permissions": { + "allow": [ + "Bash(grep:*)", + "Bash(rg:*)", + "Bash(cargo test:*)", + "Bash(cargo run:*)", + "Bash(cargo check:*)", + "Bash(cargo fmt:*)" + ], + "deny": [] + } +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 41f807d1..074ed19b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -334,6 +334,12 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + [[package]] name = "bindgen" version = "0.66.1" @@ -747,9 +753,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.3" +version = "1.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d" +checksum = "d487aa071b5f64da6f19a3e848e3578944b726ee5a4854b82172f02aa876bfdc" dependencies = [ "shlex", ] @@ -1363,6 +1369,12 @@ dependencies = [ "spin", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2113,6 +2125,40 @@ dependencies = [ "value-bag", ] +[[package]] +name = "logos" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab6f536c1af4c7cc81edf73da1f8029896e7e1e16a219ef09b184e76a296f3db" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "189bbfd0b61330abea797e5e9276408f2edbe4f822d7ad08685d67419aafb34e" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax 0.8.5", + "rustc_version", + "syn 2.0.90", +] + +[[package]] +name = "logos-derive" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebfe8e1a19049ddbfccbd14ac834b215e11b85b90bab0c2dba7c7b92fb5d5cba" +dependencies = [ + "logos-codegen", +] + [[package]] name = "lsp-types" version = "0.94.1" @@ -2160,6 +2206,28 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miette" +version = "7.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f98efec8807c63c752b5bd61f862c165c115b0a35685bdcfd9238c7aeb592b7" +dependencies = [ + "cfg-if", + "miette-derive", + "unicode-width", +] + +[[package]] +name = "miette-derive" +version = "7.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db5b29714e950dbb20d5e6f74f9dcec4edbcc1067bb7f8ed198c097b8c1a818b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "mimalloc" version = "0.1.43" @@ -2694,20 +2762,23 @@ name = "pgt_lexer" version = "0.0.0" dependencies = [ "insta", - "pg_query", "pgt_diagnostics", "pgt_lexer_codegen", "pgt_text_size", - "regex", + "pgt_tokenizer", ] [[package]] name = "pgt_lexer_codegen" version = "0.0.0" dependencies = [ - "pgt_query_proto_parser", + "anyhow", + "convert_case", "proc-macro2", + 
"prost-reflect", + "protox", "quote", + "ureq", ] [[package]] @@ -2755,20 +2826,9 @@ dependencies = [ "petgraph", "pg_query", "pgt_diagnostics", - "pgt_lexer", - "pgt_query_ext_codegen", "pgt_text_size", ] -[[package]] -name = "pgt_query_ext_codegen" -version = "0.0.0" -dependencies = [ - "pgt_query_proto_parser", - "proc-macro2", - "quote", -] - [[package]] name = "pgt_query_proto_parser" version = "0.0.0" @@ -2851,6 +2911,13 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "pgt_tokenizer" +version = "0.0.0" +dependencies = [ + "insta", +] + [[package]] name = "pgt_treesitter_queries" version = "0.0.0" @@ -3194,6 +3261,18 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "prost-reflect" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37587d5a8a1b3dc9863403d084fc2254b91ab75a702207098837950767e2260b" +dependencies = [ + "logos", + "miette", + "prost", + "prost-types", +] + [[package]] name = "prost-types" version = "0.13.5" @@ -3239,6 +3318,33 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "protox" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "424c2bd294b69c49b949f3619362bc3c5d28298cd1163b6d1a62df37c16461aa" +dependencies = [ + "bytes", + "miette", + "prost", + "prost-reflect", + "prost-types", + "protox-parse", + "thiserror 2.0.6", +] + +[[package]] +name = "protox-parse" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57927f9dbeeffcce7192404deee6157a640cbb3fe8ac11eabbe571565949ab75" +dependencies = [ + "logos", + "miette", + "prost-types", + "thiserror 2.0.6", +] + [[package]] name = "pulldown-cmark" version = "0.12.2" @@ -3405,6 +3511,20 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rsa" version = "0.9.7" @@ -3458,6 +3578,15 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.37.28" @@ -3485,6 +3614,41 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "rustls" +version = "0.23.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7160e3e10bf4535308537f3c4e1641468cd0e485175d6163087c0393c7d46643" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.20" @@ -3539,6 +3703,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + [[package]] name = "serde" version = "1.0.215" @@ -4647,6 +4817,28 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" +dependencies = [ + "base64", + "flate2", + "log", + "once_cell", + "rustls", + "rustls-pki-types", + "url", + "webpki-roots 0.26.11", +] + [[package]] name = "url" version = "2.5.4" @@ -4844,6 +5036,24 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-roots" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" +dependencies = [ + "webpki-roots 1.0.1", +] + +[[package]] +name = "webpki-roots" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8782dd5a41a24eed3a4f40b606249b3e236ca61adf1f25ea4d45c73de122b502" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "which" version = "4.4.2" diff --git a/Cargo.toml b/Cargo.toml index fe00d7ca..b5d6dd01 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,9 @@ slotmap = "1.0.7" smallvec = { version = "1.13.2", features = ["union", "const_new", "serde"] } strum = { version = "0.27.1", features = ["derive"] } # this will use tokio if available, otherwise async-std +convert_case = "0.6.0" +prost-reflect = "0.15.3" +protox = "0.8.0" sqlx = { version = "0.8.2", features = ["runtime-tokio", "runtime-async-std", "postgres", "json"] } syn = "1.0.109" termcolor = "1.4.1" @@ -72,12 +75,12 @@ pgt_lexer_codegen = { path = "./crates/pgt_lexer_codegen", version = "0 pgt_lsp = { path = "./crates/pgt_lsp", version = "0.0.0" } pgt_markup = { path = "./crates/pgt_markup", version = "0.0.0" } pgt_query_ext = { path = "./crates/pgt_query_ext", version = "0.0.0" } -pgt_query_ext_codegen = { path = "./crates/pgt_query_ext_codegen", version = "0.0.0" } pgt_query_proto_parser = { path = "./crates/pgt_query_proto_parser", version = "0.0.0" } pgt_schema_cache = { path = "./crates/pgt_schema_cache", version = "0.0.0" } pgt_statement_splitter = { path = "./crates/pgt_statement_splitter", version = "0.0.0" } pgt_text_edit = { path = "./crates/pgt_text_edit", version = "0.0.0" } pgt_text_size = { path = "./crates/pgt_text_size", version = "0.0.0" } +pgt_tokenizer = { path = "./crates/pgt_tokenizer", version = "0.0.0" } pgt_treesitter_queries = { path = "./crates/pgt_treesitter_queries", version = "0.0.0" } pgt_typecheck = { path = "./crates/pgt_typecheck", version = "0.0.0" } pgt_workspace = { path = 
"./crates/pgt_workspace", version = "0.0.0" } diff --git a/crates/pgt_diagnostics/src/display/message.rs b/crates/pgt_diagnostics/src/display/message.rs index 3cf9be3f..20c039a9 100644 --- a/crates/pgt_diagnostics/src/display/message.rs +++ b/crates/pgt_diagnostics/src/display/message.rs @@ -47,6 +47,15 @@ impl From for MessageAndDescription { } } +impl From<&str> for MessageAndDescription { + fn from(description: &str) -> Self { + Self { + message: markup! { {description} }.to_owned(), + description: description.into(), + } + } +} + impl From for MessageAndDescription { fn from(message: MarkupBuf) -> Self { let description = markup_to_string(&message); diff --git a/crates/pgt_lexer/Cargo.toml b/crates/pgt_lexer/Cargo.toml index 4b218588..7f4ada43 100644 --- a/crates/pgt_lexer/Cargo.toml +++ b/crates/pgt_lexer/Cargo.toml @@ -12,16 +12,12 @@ version = "0.0.0" [dependencies] -regex = "1.9.1" - -pg_query.workspace = true pgt_diagnostics.workspace = true pgt_lexer_codegen.workspace = true - -pgt_text_size.workspace = true +pgt_text_size.workspace = true +pgt_tokenizer.workspace = true [dev-dependencies] insta.workspace = true [lib] -doctest = false diff --git a/crates/pgt_lexer/README.md b/crates/pgt_lexer/README.md index ec61c7b2..57bdaa34 100644 --- a/crates/pgt_lexer/README.md +++ b/crates/pgt_lexer/README.md @@ -1,8 +1 @@ -# pgt_lexer - -The `pgt_lexer` crate exposes the `lex` method, which turns an SQL query text into a `Vec>`: the base for the `pg_parser` and most of pgtools's operations. - -A token is always of a certain `SyntaxKind` kind. That `SyntaxKind` enum is derived from `libpg_query`'s protobuf file. - -The SQL query text is mostly lexed using the `pg_query::scan` method (`pg_query` is just a Rust wrapper around `libpg_query`). -However, that method does not parse required whitespace tokens, so the `lex` method takes care of parsing those and merging them into the result. +Heavily inspired by and copied from [squawk_parser](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_parser). Thanks for making all the hard work MIT-licensed! diff --git a/crates/pgt_lexer/src/codegen.rs b/crates/pgt_lexer/src/codegen.rs deleted file mode 100644 index 6c750590..00000000 --- a/crates/pgt_lexer/src/codegen.rs +++ /dev/null @@ -1,3 +0,0 @@ -use pgt_lexer_codegen::lexer_codegen; - -lexer_codegen!(); diff --git a/crates/pgt_lexer/src/codegen/mod.rs b/crates/pgt_lexer/src/codegen/mod.rs new file mode 100644 index 00000000..c4e67bc5 --- /dev/null +++ b/crates/pgt_lexer/src/codegen/mod.rs @@ -0,0 +1 @@ +pub mod syntax_kind; diff --git a/crates/pgt_lexer/src/codegen/syntax_kind.rs b/crates/pgt_lexer/src/codegen/syntax_kind.rs new file mode 100644 index 00000000..f50398ec --- /dev/null +++ b/crates/pgt_lexer/src/codegen/syntax_kind.rs @@ -0,0 +1 @@ +pgt_lexer_codegen::syntax_kind_codegen!(); diff --git a/crates/pgt_lexer/src/diagnostics.rs b/crates/pgt_lexer/src/diagnostics.rs deleted file mode 100644 index 9516387a..00000000 --- a/crates/pgt_lexer/src/diagnostics.rs +++ /dev/null @@ -1,67 +0,0 @@ -use pgt_diagnostics::{Diagnostic, MessageAndDescription}; -use pgt_text_size::TextRange; - -/// A specialized diagnostic for scan errors. -/// -/// Scan diagnostics are always **fatal errors**. 
-#[derive(Clone, Debug, Diagnostic, PartialEq)] -#[diagnostic(category = "syntax", severity = Fatal)] -pub struct ScanError { - /// The location where the error is occurred - #[location(span)] - span: Option, - #[message] - #[description] - pub message: MessageAndDescription, -} - -impl ScanError { - pub fn from_pg_query_err(err: pg_query::Error, input: &str) -> Vec { - let err_msg = err.to_string(); - let re = regex::Regex::new(r#"at or near "(.*?)""#).unwrap(); - let mut diagnostics = Vec::new(); - - for captures in re.captures_iter(&err_msg) { - if let Some(matched) = captures.get(1) { - let search_term = matched.as_str(); - for (idx, _) in input.match_indices(search_term) { - let from = idx; - let to = from + search_term.len(); - diagnostics.push(ScanError { - span: Some(TextRange::new( - from.try_into().unwrap(), - to.try_into().unwrap(), - )), - message: MessageAndDescription::from(err_msg.clone()), - }); - } - } - } - - if diagnostics.is_empty() { - diagnostics.push(ScanError { - span: None, - message: MessageAndDescription::from(err_msg), - }); - } - - diagnostics - } -} - -#[cfg(test)] -mod tests { - use crate::lex; - - #[test] - fn finds_all_occurrences() { - let input = - "select 1443ddwwd33djwdkjw13331333333333; select 1443ddwwd33djwdkjw13331333333333;"; - let diagnostics = lex(input).unwrap_err(); - assert_eq!(diagnostics.len(), 2); - assert_eq!(diagnostics[0].span.unwrap().start(), 7.into()); - assert_eq!(diagnostics[0].span.unwrap().end(), 39.into()); - assert_eq!(diagnostics[1].span.unwrap().start(), 48.into()); - assert_eq!(diagnostics[1].span.unwrap().end(), 80.into()); - } -} diff --git a/crates/pgt_lexer/src/lexed.rs b/crates/pgt_lexer/src/lexed.rs new file mode 100644 index 00000000..6f0a273f --- /dev/null +++ b/crates/pgt_lexer/src/lexed.rs @@ -0,0 +1,107 @@ +use pgt_diagnostics::{Diagnostic, MessageAndDescription}; +use pgt_text_size::TextRange; + +use crate::SyntaxKind; + +/// Internal error type used during lexing +#[derive(Debug, Clone)] +pub struct LexError { + pub msg: String, + pub token: u32, +} + +/// A specialized diagnostic for lex errors. 
+#[derive(Clone, Debug, Diagnostic, PartialEq)] +#[diagnostic(category = "syntax", severity = Error)] +pub struct LexDiagnostic { + /// The location where the error occurred + #[location(span)] + pub span: TextRange, + #[message] + #[description] + pub message: MessageAndDescription, +} + +/// Result of lexing a string, providing access to tokens and diagnostics +pub struct Lexed<'a> { + pub(crate) text: &'a str, + pub(crate) kind: Vec<SyntaxKind>, + pub(crate) start: Vec<u32>, + pub(crate) error: Vec<LexError>, + pub(crate) line_ending_counts: Vec<usize>, +} + +impl Lexed<'_> { + /// Returns the number of tokens + pub fn len(&self) -> usize { + self.kind.len() + } + + /// Returns true if there are no tokens + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns an iterator over token kinds + pub fn tokens(&self) -> impl Iterator<Item = SyntaxKind> + '_ { + self.kind.iter().copied() + } + + /// Returns the kind of token at the given index + pub fn kind(&self, idx: usize) -> SyntaxKind { + assert!( + idx < self.len(), + "expected index < {}, got {}", + self.len(), + idx + ); + self.kind[idx] + } + + /// Returns the number of line endings in the token at the given index + pub fn line_ending_count(&self, idx: usize) -> usize { + assert!( + idx < self.len(), + "expected index < {}, got {}", + self.len(), + idx + ); + assert!(self.kind(idx) == SyntaxKind::LINE_ENDING); + self.line_ending_counts[idx] + } + + /// Returns the text range of the token at the given index + pub fn range(&self, idx: usize) -> TextRange { + self.text_range(idx) + } + + /// Returns the text of the token at the given index + pub fn text(&self, idx: usize) -> &str { + self.range_text(idx..idx + 1) + } + + /// Returns all lexing errors with their text ranges + pub fn errors(&self) -> Vec<LexDiagnostic> { + self.error + .iter() + .map(|it| LexDiagnostic { + message: it.msg.as_str().into(), + span: self.text_range(it.token as usize), + }) + .collect() + } + + pub(crate) fn text_range(&self, i: usize) -> TextRange { + assert!(i < self.len()); + let lo = self.start[i]; + let hi = self.start[i + 1]; + TextRange::new(lo.into(), hi.into()) + } + + fn range_text(&self, r: std::ops::Range<usize>) -> &str { + assert!(r.start < r.end && r.end <= self.len()); + let lo = self.start[r.start] as usize; + let hi = self.start[r.end] as usize; + &self.text[lo..hi] + } +} diff --git a/crates/pgt_lexer/src/lexer.rs b/crates/pgt_lexer/src/lexer.rs new file mode 100644 index 00000000..db4b4ae2 --- /dev/null +++ b/crates/pgt_lexer/src/lexer.rs @@ -0,0 +1,208 @@ +use pgt_tokenizer::tokenize; + +use crate::SyntaxKind; +use crate::lexed::{LexError, Lexed}; + +/// Lexer that processes input text into tokens and diagnostics +pub struct Lexer<'a> { + text: &'a str, + kind: Vec<SyntaxKind>, + start: Vec<u32>, + error: Vec<LexError>, + offset: usize, + /// we store line ending counts outside of SyntaxKind because of the u16 representation of SyntaxKind + line_ending_counts: Vec<usize>, +} + +impl<'a> Lexer<'a> { + /// Create a new lexer for the given text + pub fn new(text: &'a str) -> Self { + Self { + text, + kind: Vec::new(), + start: Vec::new(), + error: Vec::new(), + offset: 0, + line_ending_counts: Vec::new(), + } + } + + /// Lex the input text and return the result + pub fn lex(mut self) -> Lexed<'a> { + for token in tokenize(&self.text[self.offset..]) { + let token_text = &self.text[self.offset..][..token.len as usize]; + self.extend_token(&token.kind, token_text); + } + + // Add EOF token + self.push(SyntaxKind::EOF, 0, None, None); + + Lexed { + text: self.text, + kind: self.kind, + start: self.start, + error: self.error, +
line_ending_counts: self.line_ending_counts, + } + } + + fn push( + &mut self, + kind: SyntaxKind, + len: usize, + err: Option<&str>, + line_ending_count: Option, + ) { + self.kind.push(kind); + self.start.push(self.offset as u32); + self.offset += len; + + assert!( + kind != SyntaxKind::LINE_ENDING || line_ending_count.is_some(), + "Line ending token must have a line ending count" + ); + + self.line_ending_counts.push(line_ending_count.unwrap_or(0)); + + if let Some(err) = err { + let token = (self.kind.len() - 1) as u32; + let msg = err.to_owned(); + self.error.push(LexError { msg, token }); + } + } + + fn extend_token(&mut self, kind: &pgt_tokenizer::TokenKind, token_text: &str) { + let mut err = ""; + let mut line_ending_count = None; + + let syntax_kind = { + match kind { + pgt_tokenizer::TokenKind::LineComment => SyntaxKind::COMMENT, + pgt_tokenizer::TokenKind::BlockComment { terminated } => { + if !terminated { + err = "Missing trailing `*/` symbols to terminate the block comment"; + } + SyntaxKind::COMMENT + } + pgt_tokenizer::TokenKind::Space => SyntaxKind::SPACE, + pgt_tokenizer::TokenKind::Tab => SyntaxKind::TAB, + pgt_tokenizer::TokenKind::LineEnding { count } => { + line_ending_count = Some(*count); + SyntaxKind::LINE_ENDING + } + pgt_tokenizer::TokenKind::VerticalTab => SyntaxKind::VERTICAL_TAB, + pgt_tokenizer::TokenKind::FormFeed => SyntaxKind::FORM_FEED, + pgt_tokenizer::TokenKind::Ident => { + SyntaxKind::from_keyword(token_text).unwrap_or(SyntaxKind::IDENT) + } + pgt_tokenizer::TokenKind::Literal { kind, .. } => { + self.extend_literal(token_text.len(), kind); + return; + } + pgt_tokenizer::TokenKind::Semi => SyntaxKind::SEMICOLON, + pgt_tokenizer::TokenKind::Comma => SyntaxKind::COMMA, + pgt_tokenizer::TokenKind::Dot => SyntaxKind::DOT, + pgt_tokenizer::TokenKind::OpenParen => SyntaxKind::L_PAREN, + pgt_tokenizer::TokenKind::CloseParen => SyntaxKind::R_PAREN, + pgt_tokenizer::TokenKind::OpenBracket => SyntaxKind::L_BRACK, + pgt_tokenizer::TokenKind::CloseBracket => SyntaxKind::R_BRACK, + pgt_tokenizer::TokenKind::At => SyntaxKind::AT, + pgt_tokenizer::TokenKind::Pound => SyntaxKind::POUND, + pgt_tokenizer::TokenKind::Tilde => SyntaxKind::TILDE, + pgt_tokenizer::TokenKind::Question => SyntaxKind::QUESTION, + pgt_tokenizer::TokenKind::Colon => SyntaxKind::COLON, + pgt_tokenizer::TokenKind::Eq => SyntaxKind::EQ, + pgt_tokenizer::TokenKind::Bang => SyntaxKind::BANG, + pgt_tokenizer::TokenKind::Lt => SyntaxKind::L_ANGLE, + pgt_tokenizer::TokenKind::Gt => SyntaxKind::R_ANGLE, + pgt_tokenizer::TokenKind::Minus => SyntaxKind::MINUS, + pgt_tokenizer::TokenKind::And => SyntaxKind::AMP, + pgt_tokenizer::TokenKind::Or => SyntaxKind::PIPE, + pgt_tokenizer::TokenKind::Plus => SyntaxKind::PLUS, + pgt_tokenizer::TokenKind::Star => SyntaxKind::STAR, + pgt_tokenizer::TokenKind::Slash => SyntaxKind::SLASH, + pgt_tokenizer::TokenKind::Caret => SyntaxKind::CARET, + pgt_tokenizer::TokenKind::Percent => SyntaxKind::PERCENT, + pgt_tokenizer::TokenKind::Unknown => SyntaxKind::ERROR, + pgt_tokenizer::TokenKind::Backslash => SyntaxKind::BACKSLASH, + pgt_tokenizer::TokenKind::UnknownPrefix => { + err = "unknown literal prefix"; + SyntaxKind::IDENT + } + pgt_tokenizer::TokenKind::Eof => SyntaxKind::EOF, + pgt_tokenizer::TokenKind::Backtick => SyntaxKind::BACKTICK, + pgt_tokenizer::TokenKind::PositionalParam => SyntaxKind::POSITIONAL_PARAM, + pgt_tokenizer::TokenKind::QuotedIdent { terminated } => { + if !terminated { + err = "Missing trailing \" to terminate the quoted identifier" + } + 
SyntaxKind::IDENT + } + } + }; + + let err = if err.is_empty() { None } else { Some(err) }; + self.push(syntax_kind, token_text.len(), err, line_ending_count); + } + + fn extend_literal(&mut self, len: usize, kind: &pgt_tokenizer::LiteralKind) { + let mut err = ""; + + let syntax_kind = match *kind { + pgt_tokenizer::LiteralKind::Int { empty_int, base: _ } => { + if empty_int { + err = "Missing digits after the integer base prefix"; + } + SyntaxKind::INT_NUMBER + } + pgt_tokenizer::LiteralKind::Float { + empty_exponent, + base: _, + } => { + if empty_exponent { + err = "Missing digits after the exponent symbol"; + } + SyntaxKind::FLOAT_NUMBER + } + pgt_tokenizer::LiteralKind::Str { terminated } => { + if !terminated { + err = "Missing trailing `'` symbol to terminate the string literal"; + } + SyntaxKind::STRING + } + pgt_tokenizer::LiteralKind::ByteStr { terminated } => { + if !terminated { + err = "Missing trailing `'` symbol to terminate the hex bit string literal"; + } + SyntaxKind::BYTE_STRING + } + pgt_tokenizer::LiteralKind::BitStr { terminated } => { + if !terminated { + err = "Missing trailing `'` symbol to terminate the bit string literal"; + } + SyntaxKind::BIT_STRING + } + pgt_tokenizer::LiteralKind::DollarQuotedString { terminated } => { + if !terminated { + err = "Unterminated dollar quoted string literal"; + } + SyntaxKind::DOLLAR_QUOTED_STRING + } + pgt_tokenizer::LiteralKind::UnicodeEscStr { terminated } => { + if !terminated { + err = "Missing trailing `'` symbol to terminate the unicode escape string literal"; + } + SyntaxKind::BYTE_STRING + } + pgt_tokenizer::LiteralKind::EscStr { terminated } => { + if !terminated { + err = "Missing trailing `'` symbol to terminate the escape string literal"; + } + SyntaxKind::ESC_STRING + } + }; + + let err = if err.is_empty() { None } else { Some(err) }; + self.push(syntax_kind, len, err, None); + } +} diff --git a/crates/pgt_lexer/src/lib.rs b/crates/pgt_lexer/src/lib.rs index 32bbdd42..2d8779a7 100644 --- a/crates/pgt_lexer/src/lib.rs +++ b/crates/pgt_lexer/src/lib.rs @@ -1,191 +1,14 @@ mod codegen; -pub mod diagnostics; +mod lexed; +mod lexer; -use diagnostics::ScanError; -use pg_query::protobuf::{KeywordKind, ScanToken}; -use pgt_text_size::{TextLen, TextRange, TextSize}; -use regex::Regex; -use std::{collections::VecDeque, sync::LazyLock}; +pub use crate::codegen::syntax_kind::SyntaxKind; +pub use crate::lexed::{LexDiagnostic, Lexed}; +pub use crate::lexer::Lexer; -pub use crate::codegen::SyntaxKind; - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TokenType { - Whitespace, - NoKeyword, - UnreservedKeyword, - ColNameKeyword, - TypeFuncNameKeyword, - ReservedKeyword, -} - -impl From<&ScanToken> for TokenType { - fn from(token: &ScanToken) -> TokenType { - match token.token { - // SqlComment | CComment - 275 | 276 => TokenType::Whitespace, - _ => match token.keyword_kind() { - KeywordKind::NoKeyword => TokenType::NoKeyword, - KeywordKind::UnreservedKeyword => TokenType::UnreservedKeyword, - KeywordKind::ColNameKeyword => TokenType::ColNameKeyword, - KeywordKind::TypeFuncNameKeyword => TokenType::TypeFuncNameKeyword, - KeywordKind::ReservedKeyword => TokenType::ReservedKeyword, - }, - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Token { - pub kind: SyntaxKind, - pub text: String, - pub span: TextRange, - pub token_type: TokenType, -} - -impl Token { - pub fn eof(pos: usize) -> Token { - Token { - kind: SyntaxKind::Eof, - text: "".to_string(), - span: TextRange::at(TextSize::try_from(pos).unwrap(), 
TextSize::from(0)), - token_type: TokenType::Whitespace, - } - } -} - -pub static WHITESPACE_TOKENS: &[SyntaxKind] = &[ - SyntaxKind::Whitespace, - SyntaxKind::Tab, - SyntaxKind::Newline, - SyntaxKind::SqlComment, - SyntaxKind::CComment, -]; - -static PATTERN_LEXER: LazyLock = LazyLock::new(|| { - #[cfg(windows)] - { - // On Windows, treat \r\n as a single newline token - Regex::new(r"(?P +)|(?P(\r\n|\n)+)|(?P\t+)").unwrap() - } - #[cfg(not(windows))] - { - // On other platforms, just check for \n - Regex::new(r"(?P +)|(?P\n+)|(?P\t+)").unwrap() - } -}); - -fn whitespace_tokens(input: &str) -> VecDeque { - let mut tokens = VecDeque::new(); - - for cap in PATTERN_LEXER.captures_iter(input) { - if let Some(whitespace) = cap.name("whitespace") { - tokens.push_back(Token { - token_type: TokenType::Whitespace, - kind: SyntaxKind::Whitespace, - text: whitespace.as_str().to_string(), - span: TextRange::new( - TextSize::from(u32::try_from(whitespace.start()).unwrap()), - TextSize::from(u32::try_from(whitespace.end()).unwrap()), - ), - }); - } else if let Some(newline) = cap.name("newline") { - tokens.push_back(Token { - token_type: TokenType::Whitespace, - kind: SyntaxKind::Newline, - text: newline.as_str().to_string(), - span: TextRange::new( - TextSize::from(u32::try_from(newline.start()).unwrap()), - TextSize::from(u32::try_from(newline.end()).unwrap()), - ), - }); - } else if let Some(tab) = cap.name("tab") { - tokens.push_back(Token { - token_type: TokenType::Whitespace, - kind: SyntaxKind::Tab, - text: tab.as_str().to_string(), - span: TextRange::new( - TextSize::from(u32::try_from(tab.start()).unwrap()), - TextSize::from(u32::try_from(tab.end()).unwrap()), - ), - }); - } else { - panic!("No match"); - }; - } - - tokens -} - -/// Turn a string of potentially valid sql code into a list of tokens, including their range in the source text. -/// -/// The implementation is primarily using libpg_querys `scan` method, and fills in the gaps with tokens that are not parsed by the library, e.g. whitespace. 
-pub fn lex(text: &str) -> Result, Vec> { - let mut whitespace_tokens = whitespace_tokens(text); - - // tokens from pg_query.rs - let mut pgt_query_tokens = match pg_query::scan(text) { - Ok(r) => r.tokens.into_iter().collect::>(), - Err(err) => return Err(ScanError::from_pg_query_err(err, text)), - }; - - // merge the two token lists - let mut tokens: Vec = Vec::new(); - let mut pos = TextSize::from(0); - - while pos < text.text_len() { - if !pgt_query_tokens.is_empty() - && TextSize::from(u32::try_from(pgt_query_tokens[0].start).unwrap()) == pos - { - let pgt_query_token = pgt_query_tokens.pop_front().unwrap(); - - // the lexer returns byte indices, so we need to slice - let token_text = &text[usize::try_from(pgt_query_token.start).unwrap() - ..usize::try_from(pgt_query_token.end).unwrap()]; - - let len = token_text.text_len(); - let has_whitespace = token_text.contains(" ") || token_text.contains("\n"); - tokens.push(Token { - token_type: TokenType::from(&pgt_query_token), - kind: SyntaxKind::from(&pgt_query_token), - text: token_text.to_string(), - span: TextRange::new( - TextSize::from(u32::try_from(pgt_query_token.start).unwrap()), - TextSize::from(u32::try_from(pgt_query_token.end).unwrap()), - ), - }); - pos += len; - - if has_whitespace { - while !whitespace_tokens.is_empty() - && whitespace_tokens[0].span.start() < TextSize::from(u32::from(pos)) - { - whitespace_tokens.pop_front(); - } - } - - continue; - } - - if !whitespace_tokens.is_empty() - && whitespace_tokens[0].span.start() == TextSize::from(u32::from(pos)) - { - let whitespace_token = whitespace_tokens.pop_front().unwrap(); - let len = whitespace_token.text.text_len(); - tokens.push(whitespace_token); - pos += len; - continue; - } - - let usize_pos = usize::from(pos); - panic!( - "No token found at position {:?}: '{:?}'", - pos, - text.get(usize_pos..usize_pos + 1) - ); - } - - Ok(tokens) +/// Lex the input string into tokens and diagnostics +pub fn lex(input: &str) -> Lexed { + Lexer::new(input).lex() } #[cfg(test)] @@ -193,101 +16,106 @@ mod tests { use super::*; #[test] - fn test_special_chars() { - let input = "insert into c (name, full_name) values ('Å', 1);"; - let tokens = lex(input).unwrap(); - assert!(!tokens.is_empty()); - } + fn test_basic_lexing() { + let input = "SELECT * FROM users WHERE id = 1;"; + let lexed = lex(input); + + // Check we have tokens + assert!(!lexed.is_empty()); + + // Iterate over tokens and collect identifiers + let mut identifiers = Vec::new(); + for (idx, kind) in lexed.tokens().enumerate() { + if kind == SyntaxKind::IDENT { + identifiers.push((lexed.text(idx), lexed.range(idx))); + } + } - #[test] - fn test_tab_tokens() { - let input = "select\t1"; - let tokens = lex(input).unwrap(); - assert_eq!(tokens[1].kind, SyntaxKind::Tab); + // Should find at least "users" and "id" as identifiers + assert!(identifiers.len() >= 2); } #[test] - fn test_newline_tokens() { - let input = "select\n1"; - let tokens = lex(input).unwrap(); - assert_eq!(tokens[1].kind, SyntaxKind::Newline); + fn test_lexing_with_errors() { + let input = "SELECT 'unterminated string"; + let lexed = lex(input); + + // Should have tokens + assert!(!lexed.is_empty()); + + // Should have an error for unterminated string + let errors = lexed.errors(); + assert!(!errors.is_empty()); + // Check the error message exists + assert!(!errors[0].message.to_string().is_empty()); } #[test] - fn test_consecutive_newlines() { - // Test with multiple consecutive newlines - #[cfg(windows)] - let input = "select\r\n\r\n1"; - 
#[cfg(not(windows))] - let input = "select\n\n1"; - - let tokens = lex(input).unwrap(); - - // Check that we have exactly one newline token between "select" and "1" - assert_eq!(tokens[0].kind, SyntaxKind::Select); - assert_eq!(tokens[1].kind, SyntaxKind::Newline); - assert_eq!(tokens[2].kind, SyntaxKind::Iconst); + fn test_token_ranges() { + let input = "SELECT id"; + let lexed = lex(input); + + // First token should be a keyword (SELECT gets parsed as a keyword) + let _first_kind = lexed.kind(0); + assert_eq!(u32::from(lexed.range(0).start()), 0); + assert_eq!(u32::from(lexed.range(0).end()), 6); + assert_eq!(lexed.text(0), "SELECT"); + + // Find the id token + for (idx, kind) in lexed.tokens().enumerate() { + if kind == SyntaxKind::IDENT && lexed.text(idx) == "id" { + assert_eq!(u32::from(lexed.range(idx).start()), 7); + assert_eq!(u32::from(lexed.range(idx).end()), 9); + } + } } #[test] - fn test_whitespace_tokens() { - let input = "select 1"; - let tokens = lex(input).unwrap(); - assert_eq!(tokens[1].kind, SyntaxKind::Whitespace); + fn test_empty_input() { + let input = ""; + let lexed = lex(input); + assert_eq!(lexed.len(), 1); + assert_eq!(lexed.kind(0), SyntaxKind::EOF); } #[test] - fn test_lexer() { - let input = "select 1; \n -- some comment \n select 2\t"; - - let tokens = lex(input).unwrap(); - let mut tokens_iter = tokens.iter(); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Select); - assert_eq!(token.text, "select"); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Whitespace); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Iconst); - assert_eq!(token.text, "1"); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Ascii59); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Whitespace); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Newline); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Whitespace); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::SqlComment); - assert_eq!(token.text, "-- some comment "); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Newline); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Whitespace); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Select); - assert_eq!(token.text, "select"); - - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Whitespace); + fn test_whitespace_handling() { + let input = " SELECT \n id "; + let lexed = lex(input); + + // Collect non-whitespace tokens + let mut non_whitespace = Vec::new(); + for (idx, kind) in lexed.tokens().enumerate() { + if !matches!( + kind, + SyntaxKind::SPACE | SyntaxKind::TAB | SyntaxKind::LINE_ENDING | SyntaxKind::EOF + ) { + non_whitespace.push(lexed.text(idx)); + } + } - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Iconst); - assert_eq!(token.text, "2"); + assert_eq!(non_whitespace.len(), 2); // SELECT and id + } - let token = tokens_iter.next().unwrap(); - assert_eq!(token.kind, SyntaxKind::Tab); + #[test] + fn finds_lex_errors() { + // Test with unterminated block comment + let input = "/* unterminated comment"; + let lexed = lex(input); + let errors = lexed.errors(); + + // Should have error for unterminated block comment + assert!(!errors.is_empty()); + 
assert!(errors[0].message.to_string().contains("Missing trailing")); + assert!(errors[0].span.start() < errors[0].span.end()); + + // Test with unterminated string + let input2 = "SELECT 'unterminated string"; + let lexed2 = lex(input2); + let errors2 = lexed2.errors(); + + // Should have error for unterminated string + assert!(!errors2.is_empty()); + assert!(errors2[0].message.to_string().contains("Missing trailing")); } } diff --git a/crates/pgt_lexer_codegen/Cargo.toml b/crates/pgt_lexer_codegen/Cargo.toml index c5878646..b50465b0 100644 --- a/crates/pgt_lexer_codegen/Cargo.toml +++ b/crates/pgt_lexer_codegen/Cargo.toml @@ -10,12 +10,16 @@ name = "pgt_lexer_codegen" repository.workspace = true version = "0.0.0" - [dependencies] -pgt_query_proto_parser.workspace = true -proc-macro2.workspace = true -quote = "1.0.33" +anyhow = { workspace = true } +convert_case = { workspace = true } +proc-macro2.workspace = true +prost-reflect = { workspace = true } +protox = { workspace = true } +quote.workspace = true + +[build-dependencies] +ureq = "2.9" [lib] -doctest = false proc-macro = true diff --git a/crates/pgt_lexer_codegen/README.md b/crates/pgt_lexer_codegen/README.md index 843ac2f8..57bdaa34 100644 --- a/crates/pgt_lexer_codegen/README.md +++ b/crates/pgt_lexer_codegen/README.md @@ -1,7 +1 @@ -# pgt_lexer_codegen - -This crate is responsible for reading `libpg_query`'s protobuf file and turning it into the Rust enum `SyntaxKind`. - -It does so by reading the file from the installed git submodule, parsing it with a protobuf parser, and using a procedural macro to generate the enum. - -Rust requires procedural macros to be defined in a different crate than where they're used, hence this \_codegen crate. +Heavily inspired by and copied from [squawk_parser](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_parser). Thanks for making all the hard work MIT-licensed! 
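A minimal usage sketch of the reworked lexer API introduced above, for orientation while reviewing; it only uses the `lex`, `Lexed`, `LexDiagnostic`, and `SyntaxKind` items exported from `crates/pgt_lexer/src/lib.rs` in this diff and is illustrative rather than part of the patch:

```rust
use pgt_lexer::{SyntaxKind, lex};

fn main() {
    // Lexing no longer returns a Result: errors surface as diagnostics on `Lexed`.
    let lexed = lex("SELECT id FROM users WHERE id = 1;");

    // Tokens are addressed by index; kind, text and range are separate lookups.
    for (idx, kind) in lexed.tokens().enumerate() {
        if kind == SyntaxKind::IDENT {
            println!("ident {:?} at {:?}", lexed.text(idx), lexed.range(idx));
        }
    }

    // Lex errors (unterminated strings, comments, ...) come back as `LexDiagnostic`s.
    for diag in lexed.errors() {
        eprintln!("lex error at {:?}", diag.span);
    }
}
```

Compared to the old `lex`, whitespace and keyword handling now come from `pgt_tokenizer` plus the generated `SyntaxKind::from_keyword`, so there is no `pg_query::scan` round-trip and no separate whitespace-merging pass.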
diff --git a/crates/pgt_lexer_codegen/build.rs b/crates/pgt_lexer_codegen/build.rs new file mode 100644 index 00000000..70c9635d --- /dev/null +++ b/crates/pgt_lexer_codegen/build.rs @@ -0,0 +1,49 @@ +use std::env; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +// TODO make this selectable via feature flags +static LIBPG_QUERY_TAG: &str = "17-6.1.0"; + +/// Downloads the `kwlist.h` file from the specified version of `libpg_query` +fn main() -> Result<(), Box<dyn std::error::Error>> { + let version = LIBPG_QUERY_TAG.to_string(); + + // Check for the postgres header file in the source tree first + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?); + let headers_dir = manifest_dir.join("postgres").join(&version); + let kwlist_path = headers_dir.join("kwlist.h"); + + // Only download if the file doesn't exist + if !kwlist_path.exists() { + println!( + "cargo:warning=Downloading kwlist.h for libpg_query {}", + version + ); + + fs::create_dir_all(&headers_dir)?; + + let proto_url = format!( + "https://raw.githubusercontent.com/pganalyze/libpg_query/{}/src/postgres/include/parser/kwlist.h", + version + ); + + let response = ureq::get(&proto_url).call()?; + let content = response.into_string()?; + + let mut file = fs::File::create(&kwlist_path)?; + file.write_all(content.as_bytes())?; + + println!("cargo:warning=Successfully downloaded kwlist.h"); + } + + println!( + "cargo:rustc-env=PG_QUERY_KWLIST_PATH={}", + kwlist_path.display() + ); + + println!("cargo:rerun-if-changed={}", kwlist_path.display()); + + Ok(()) +} diff --git a/crates/pgt_lexer_codegen/postgres/17-6.1.0/kwlist.h b/crates/pgt_lexer_codegen/postgres/17-6.1.0/kwlist.h new file mode 100644 index 00000000..658d7ff6 --- /dev/null +++ b/crates/pgt_lexer_codegen/postgres/17-6.1.0/kwlist.h @@ -0,0 +1,518 @@ +/*------------------------------------------------------------------------- + * + * kwlist.h + * + * The keyword lists are kept in their own source files for use by + * automatic tools. The exact representation of a keyword is determined + * by the PG_KEYWORD macro, which is not defined in this file; it can + * be defined by the caller for special purposes. + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/parser/kwlist.h + * + *------------------------------------------------------------------------- + */ + +/* there is deliberately not an #ifndef KWLIST_H here */ + +/* + * List of keyword (name, token-value, category, bare-label-status) entries. + * + * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
+ */ + +/* name, value, category, is-bare-label */ +PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("absent", ABSENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("absolute", ABSOLUTE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("access", ACCESS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("action", ACTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("add", ADD_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("admin", ADMIN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("after", AFTER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("aggregate", AGGREGATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("all", ALL, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("also", ALSO, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("alter", ALTER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("always", ALWAYS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("analyse", ANALYSE, RESERVED_KEYWORD, BARE_LABEL) /* British spelling */ +PG_KEYWORD("analyze", ANALYZE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("and", AND, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("any", ANY, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("array", ARRAY, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("as", AS, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("asc", ASC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("asensitive", ASENSITIVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("assertion", ASSERTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("assignment", ASSIGNMENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("asymmetric", ASYMMETRIC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("at", AT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("atomic", ATOMIC, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("attach", ATTACH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("attribute", ATTRIBUTE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("authorization", AUTHORIZATION, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("backward", BACKWARD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("before", BEFORE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("begin", BEGIN_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("between", BETWEEN, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("bigint", BIGINT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("binary", BINARY, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("bit", BIT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("boolean", BOOLEAN_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("both", BOTH, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("breadth", BREADTH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("by", BY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("called", CALLED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cascade", CASCADE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cascaded", CASCADED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("case", CASE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cast", CAST, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("catalog", CATALOG_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("chain", CHAIN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("char", CHAR_P, COL_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("character", CHARACTER, COL_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("characteristics", CHARACTERISTICS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("check", CHECK, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("checkpoint", CHECKPOINT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("class", CLASS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("close", CLOSE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cluster", 
CLUSTER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("coalesce", COALESCE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("collate", COLLATE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("collation", COLLATION, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("column", COLUMN, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("columns", COLUMNS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("comment", COMMENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("comments", COMMENTS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("commit", COMMIT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("committed", COMMITTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("compression", COMPRESSION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("concurrently", CONCURRENTLY, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("conditional", CONDITIONAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("configuration", CONFIGURATION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("conflict", CONFLICT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("connection", CONNECTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("constraint", CONSTRAINT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("constraints", CONSTRAINTS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("content", CONTENT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("continue", CONTINUE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("conversion", CONVERSION_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("copy", COPY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cost", COST, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("create", CREATE, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("cross", CROSS, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("csv", CSV, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cube", CUBE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current", CURRENT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_catalog", CURRENT_CATALOG, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_date", CURRENT_DATE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_role", CURRENT_ROLE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_schema", CURRENT_SCHEMA, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_time", CURRENT_TIME, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_timestamp", CURRENT_TIMESTAMP, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_user", CURRENT_USER, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cursor", CURSOR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cycle", CYCLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("data", DATA_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("database", DATABASE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("day", DAY_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("deallocate", DEALLOCATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("dec", DEC, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("decimal", DECIMAL_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("declare", DECLARE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("default", DEFAULT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("defaults", DEFAULTS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("deferrable", DEFERRABLE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("deferred", DEFERRED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("definer", DEFINER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("delete", DELETE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("delimiter", DELIMITER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("delimiters", DELIMITERS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("depends", DEPENDS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("depth", DEPTH, UNRESERVED_KEYWORD, 
BARE_LABEL) +PG_KEYWORD("desc", DESC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("detach", DETACH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("dictionary", DICTIONARY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("disable", DISABLE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("discard", DISCARD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("distinct", DISTINCT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("do", DO, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("document", DOCUMENT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("domain", DOMAIN_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("double", DOUBLE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("drop", DROP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("each", EACH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("else", ELSE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("empty", EMPTY_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("enable", ENABLE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("encoding", ENCODING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("encrypted", ENCRYPTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("end", END_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("enum", ENUM_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("error", ERROR_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("escape", ESCAPE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("event", EVENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("except", EXCEPT, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("exclude", EXCLUDE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("excluding", EXCLUDING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("exclusive", EXCLUSIVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("execute", EXECUTE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("exists", EXISTS, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("explain", EXPLAIN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("expression", EXPRESSION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("extension", EXTENSION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("external", EXTERNAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("extract", EXTRACT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("false", FALSE_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("family", FAMILY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("fetch", FETCH, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("filter", FILTER, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("finalize", FINALIZE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("first", FIRST_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("float", FLOAT_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("following", FOLLOWING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("for", FOR, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("force", FORCE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("foreign", FOREIGN, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("format", FORMAT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("forward", FORWARD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("freeze", FREEZE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("from", FROM, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("full", FULL, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("function", FUNCTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("functions", FUNCTIONS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("generated", GENERATED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("global", GLOBAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("grant", GRANT, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("granted", GRANTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("greatest", GREATEST, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("group", GROUP_P, 
RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("grouping", GROUPING, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("groups", GROUPS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("handler", HANDLER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("having", HAVING, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("header", HEADER_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("hold", HOLD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("hour", HOUR_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("identity", IDENTITY_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("if", IF_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("ilike", ILIKE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("immediate", IMMEDIATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("immutable", IMMUTABLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("implicit", IMPLICIT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("import", IMPORT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("in", IN_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("include", INCLUDE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("including", INCLUDING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("increment", INCREMENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("indent", INDENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("index", INDEX, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("indexes", INDEXES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("inherit", INHERIT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("inherits", INHERITS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("initially", INITIALLY, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("inline", INLINE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("inner", INNER_P, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("inout", INOUT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("input", INPUT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("insensitive", INSENSITIVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("insert", INSERT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("instead", INSTEAD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("int", INT_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("integer", INTEGER, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("intersect", INTERSECT, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("interval", INTERVAL, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("into", INTO, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("invoker", INVOKER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("is", IS, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("isnull", ISNULL, TYPE_FUNC_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("isolation", ISOLATION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("join", JOIN, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json", JSON, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_array", JSON_ARRAY, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_arrayagg", JSON_ARRAYAGG, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_exists", JSON_EXISTS, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_object", JSON_OBJECT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_objectagg", JSON_OBJECTAGG, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_query", JSON_QUERY, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_scalar", JSON_SCALAR, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_serialize", JSON_SERIALIZE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_table", JSON_TABLE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_value", JSON_VALUE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("keep", KEEP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("key", KEY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("keys", KEYS, UNRESERVED_KEYWORD, BARE_LABEL) 
+PG_KEYWORD("label", LABEL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("language", LANGUAGE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("large", LARGE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("last", LAST_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("lateral", LATERAL_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("leading", LEADING, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("leakproof", LEAKPROOF, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("least", LEAST, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("left", LEFT, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("level", LEVEL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("like", LIKE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("limit", LIMIT, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("listen", LISTEN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("load", LOAD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("local", LOCAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("localtime", LOCALTIME, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("localtimestamp", LOCALTIMESTAMP, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("location", LOCATION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("lock", LOCK_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("locked", LOCKED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("logged", LOGGED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("mapping", MAPPING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("match", MATCH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("matched", MATCHED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("materialized", MATERIALIZED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("maxvalue", MAXVALUE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("merge", MERGE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("merge_action", MERGE_ACTION, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("method", METHOD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("minute", MINUTE_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("minvalue", MINVALUE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("mode", MODE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("month", MONTH_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("move", MOVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("name", NAME_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("names", NAMES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("national", NATIONAL, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("natural", NATURAL, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("nchar", NCHAR, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("nested", NESTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nfc", NFC, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nfd", NFD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nfkc", NFKC, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nfkd", NFKD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("no", NO, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("none", NONE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("normalize", NORMALIZE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("normalized", NORMALIZED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("not", NOT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("notify", NOTIFY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("notnull", NOTNULL, TYPE_FUNC_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("nowait", NOWAIT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("null", NULL_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nullif", NULLIF, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("nulls", NULLS_P, 
UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("numeric", NUMERIC, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("object", OBJECT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("of", OF, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("off", OFF, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("offset", OFFSET, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("oids", OIDS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("old", OLD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("omit", OMIT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("on", ON, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("only", ONLY, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("operator", OPERATOR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("option", OPTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("options", OPTIONS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("or", OR, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("order", ORDER, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("ordinality", ORDINALITY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("others", OTHERS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("out", OUT_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("outer", OUTER_P, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("over", OVER, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("overlaps", OVERLAPS, TYPE_FUNC_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("overlay", OVERLAY, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("overriding", OVERRIDING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("owned", OWNED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("owner", OWNER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("parallel", PARALLEL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("parameter", PARAMETER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("parser", PARSER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("partial", PARTIAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("partition", PARTITION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("passing", PASSING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("password", PASSWORD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("path", PATH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("placing", PLACING, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("plan", PLAN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("plans", PLANS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("policy", POLICY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("position", POSITION, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("preceding", PRECEDING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("precision", PRECISION, COL_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("prepare", PREPARE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("prepared", PREPARED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("preserve", PRESERVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("primary", PRIMARY, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("prior", PRIOR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("privileges", PRIVILEGES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("procedural", PROCEDURAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("procedure", PROCEDURE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("procedures", PROCEDURES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("program", PROGRAM, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("publication", PUBLICATION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("quotes", QUOTES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("range", RANGE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("read", READ, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("real", REAL, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("reassign", REASSIGN, 
UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("recheck", RECHECK, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("recursive", RECURSIVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("ref", REF_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("references", REFERENCES, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("referencing", REFERENCING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("refresh", REFRESH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("reindex", REINDEX, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("relative", RELATIVE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("release", RELEASE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("rename", RENAME, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("repeatable", REPEATABLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("replace", REPLACE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("replica", REPLICA, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("reset", RESET, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("restart", RESTART, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("restrict", RESTRICT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("return", RETURN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("returning", RETURNING, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("returns", RETURNS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("revoke", REVOKE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("right", RIGHT, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("role", ROLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("rollback", ROLLBACK, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("rollup", ROLLUP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("routine", ROUTINE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("routines", ROUTINES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("row", ROW, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("rows", ROWS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("rule", RULE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("savepoint", SAVEPOINT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("scalar", SCALAR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("schema", SCHEMA, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("schemas", SCHEMAS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("scroll", SCROLL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("search", SEARCH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("second", SECOND_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("security", SECURITY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("select", SELECT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("sequence", SEQUENCE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("sequences", SEQUENCES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("serializable", SERIALIZABLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("server", SERVER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("session", SESSION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("session_user", SESSION_USER, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("set", SET, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("setof", SETOF, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("sets", SETS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("share", SHARE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("show", SHOW, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("similar", SIMILAR, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("simple", SIMPLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("skip", SKIP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("smallint", SMALLINT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("snapshot", SNAPSHOT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("some", SOME, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("source", SOURCE, 
UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("sql", SQL_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("stable", STABLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("standalone", STANDALONE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("start", START, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("statement", STATEMENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("statistics", STATISTICS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("stdin", STDIN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("stdout", STDOUT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("storage", STORAGE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("stored", STORED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("strict", STRICT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("string", STRING_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("strip", STRIP_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("subscription", SUBSCRIPTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("substring", SUBSTRING, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("support", SUPPORT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("symmetric", SYMMETRIC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("sysid", SYSID, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("system", SYSTEM_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("system_user", SYSTEM_USER, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("table", TABLE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("tables", TABLES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("tablesample", TABLESAMPLE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("tablespace", TABLESPACE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("target", TARGET, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("temp", TEMP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("template", TEMPLATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("temporary", TEMPORARY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("text", TEXT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("then", THEN, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("ties", TIES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("time", TIME, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("timestamp", TIMESTAMP, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("to", TO, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("trailing", TRAILING, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("transaction", TRANSACTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("transform", TRANSFORM, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("treat", TREAT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("trigger", TRIGGER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("trim", TRIM, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("true", TRUE_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("truncate", TRUNCATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("trusted", TRUSTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("type", TYPE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("types", TYPES_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("uescape", UESCAPE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unbounded", UNBOUNDED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("uncommitted", UNCOMMITTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unconditional", UNCONDITIONAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unencrypted", UNENCRYPTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("union", UNION, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("unique", UNIQUE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unknown", UNKNOWN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unlisten", UNLISTEN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unlogged", UNLOGGED, UNRESERVED_KEYWORD, BARE_LABEL) 
+PG_KEYWORD("until", UNTIL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("update", UPDATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("user", USER, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("using", USING, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("vacuum", VACUUM, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("valid", VALID, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("validate", VALIDATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("validator", VALIDATOR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("value", VALUE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("values", VALUES, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("varchar", VARCHAR, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("variadic", VARIADIC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("varying", VARYING, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("verbose", VERBOSE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("version", VERSION_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("view", VIEW, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("views", VIEWS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("volatile", VOLATILE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("when", WHEN, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("where", WHERE, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("whitespace", WHITESPACE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("window", WINDOW, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("with", WITH, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("within", WITHIN, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("without", WITHOUT, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("work", WORK, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("wrapper", WRAPPER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("write", WRITE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("xml", XML_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlattributes", XMLATTRIBUTES, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlconcat", XMLCONCAT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlelement", XMLELEMENT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlexists", XMLEXISTS, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlforest", XMLFOREST, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlnamespaces", XMLNAMESPACES, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlparse", XMLPARSE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlpi", XMLPI, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlroot", XMLROOT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlserialize", XMLSERIALIZE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmltable", XMLTABLE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("year", YEAR_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("yes", YES_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("zone", ZONE, UNRESERVED_KEYWORD, BARE_LABEL) diff --git a/crates/pgt_lexer_codegen/src/keywords.rs b/crates/pgt_lexer_codegen/src/keywords.rs new file mode 100644 index 00000000..f0104c8d --- /dev/null +++ b/crates/pgt_lexer_codegen/src/keywords.rs @@ -0,0 +1,43 @@ +// from https://github.com/sbdchd/squawk/blob/ac9f90c3b2be8d2c46fd5454eb48975afd268dbe/crates/xtask/src/keywords.rs +use anyhow::{Context, Ok, Result}; +use std::path; + +fn parse_header() -> Result> { + // use the environment variable set by the build script to locate the kwlist.h file + let kwlist_file = path::PathBuf::from(env!("PG_QUERY_KWLIST_PATH")); + let data = std::fs::read_to_string(kwlist_file).context("Failed to read kwlist.h")?; + + let mut keywords = Vec::new(); + + for line in data.lines() { + if line.starts_with("PG_KEYWORD") { + let line = line + .split(&['(', ')']) + .nth(1) + .context("Invalid kwlist.h structure")?; + + 
let row_items: Vec<&str> = line.split(',').collect(); + + match row_items[..] { + [name, _value, _category, _is_bare_label] => { + let name = name.trim().replace('\"', ""); + keywords.push(name); + } + _ => anyhow::bail!("Problem reading kwlist.h row"), + } + } + } + + Ok(keywords) +} + +pub(crate) struct KeywordKinds { + pub(crate) all_keywords: Vec<String>, +} + +pub(crate) fn keyword_kinds() -> Result<KeywordKinds> { + let mut all_keywords = parse_header()?; + all_keywords.sort(); + + Ok(KeywordKinds { all_keywords }) +} diff --git a/crates/pgt_lexer_codegen/src/lib.rs b/crates/pgt_lexer_codegen/src/lib.rs index 8f492e4b..b620b6a6 100644 --- a/crates/pgt_lexer_codegen/src/lib.rs +++ b/crates/pgt_lexer_codegen/src/lib.rs @@ -1,29 +1,9 @@ +mod keywords; mod syntax_kind; -use pgt_query_proto_parser::ProtoParser; -use quote::quote; -use std::{env, path, path::Path}; +use syntax_kind::syntax_kind_mod; #[proc_macro] -pub fn lexer_codegen(_item: proc_macro::TokenStream) -> proc_macro::TokenStream { - let parser = ProtoParser::new(&proto_file_path()); - let proto_file = parser.parse(); - - let syntax_kind = syntax_kind::syntax_kind_mod(&proto_file); - - quote! { - use pg_query::{protobuf, protobuf::ScanToken, protobuf::Token, NodeEnum, NodeRef}; - - #syntax_kind - } - .into() -} - -fn proto_file_path() -> path::PathBuf { - Path::new(env!("CARGO_MANIFEST_DIR")) - .ancestors() - .nth(2) - .unwrap() - .join("libpg_query/protobuf/pg_query.proto") - .to_path_buf() +pub fn syntax_kind_codegen(_input: proc_macro::TokenStream) -> proc_macro::TokenStream { + syntax_kind_mod().into() } diff --git a/crates/pgt_lexer_codegen/src/syntax_kind.rs b/crates/pgt_lexer_codegen/src/syntax_kind.rs index 091b1e02..07b7a419 100644 --- a/crates/pgt_lexer_codegen/src/syntax_kind.rs +++ b/crates/pgt_lexer_codegen/src/syntax_kind.rs @@ -1,111 +1,121 @@ -use std::collections::HashSet; - -use pgt_query_proto_parser::{Node, ProtoFile, Token}; -use proc_macro2::{Ident, Literal}; +use convert_case::{Case, Casing}; +use proc_macro2::TokenStream; use quote::{format_ident, quote}; -pub fn syntax_kind_mod(proto_file: &ProtoFile) -> proc_macro2::TokenStream { - let custom_node_names = custom_node_names(); - let custom_node_identifiers = custom_node_identifiers(&custom_node_names); - - let node_identifiers = node_identifiers(&proto_file.nodes); +use crate::keywords::{KeywordKinds, keyword_kinds}; + +const WHITESPACE: &[&str] = &[ + "SPACE", // " " + "TAB", // "\t" + "VERTICAL_TAB", // "\x0B" + "FORM_FEED", // "\x0C" + "LINE_ENDING", // "\n" or "\r" in any combination +]; + +const PUNCT: &[(&str, &str)] = &[ + ("$", "DOLLAR"), + (";", "SEMICOLON"), + (",", "COMMA"), + ("(", "L_PAREN"), + (")", "R_PAREN"), + ("[", "L_BRACK"), + ("]", "R_BRACK"), + ("<", "L_ANGLE"), + (">", "R_ANGLE"), + ("@", "AT"), + ("#", "POUND"), + ("~", "TILDE"), + ("?", "QUESTION"), + ("&", "AMP"), + ("|", "PIPE"), + ("+", "PLUS"), + ("*", "STAR"), + ("/", "SLASH"), + ("\\", "BACKSLASH"), + ("^", "CARET"), + ("%", "PERCENT"), + ("_", "UNDERSCORE"), + (".", "DOT"), + (":", "COLON"), + ("=", "EQ"), + ("!", "BANG"), + ("-", "MINUS"), + ("`", "BACKTICK"), +]; + +const EXTRA: &[&str] = &["POSITIONAL_PARAM", "ERROR", "COMMENT", "EOF"]; + +const LITERALS: &[&str] = &[ + "BIT_STRING", + "BYTE_STRING", + "DOLLAR_QUOTED_STRING", + "ESC_STRING", + "FLOAT_NUMBER", + "INT_NUMBER", + "NULL", + "STRING", + "IDENT", +]; + +pub fn syntax_kind_mod() -> proc_macro2::TokenStream { + let keywords = keyword_kinds().expect("Failed to get keyword kinds"); + + let KeywordKinds { all_keywords, .. 
} = keywords; + + let mut enum_variants: Vec<TokenStream> = Vec::new(); + let mut from_kw_match_arms: Vec<TokenStream> = Vec::new(); + + // collect keywords + for kw in &all_keywords { + if kw.to_uppercase().contains("WHITESPACE") { + continue; // Skip whitespace as it is handled separately + } - let token_identifiers = token_identifiers(&proto_file.tokens); - let token_value_literals = token_value_literals(&proto_file.tokens); + let kind_ident = format_ident!("{}_KW", kw.to_case(Case::UpperSnake)); - let syntax_kind_from_impl = - syntax_kind_from_impl(&node_identifiers, &token_identifiers, &token_value_literals); + enum_variants.push(quote! { #kind_ident }); + from_kw_match_arms.push(quote! { + #kw => Some(SyntaxKind::#kind_ident) + }); + } - let mut enum_variants = HashSet::new(); - enum_variants.extend(&custom_node_identifiers); - enum_variants.extend(&node_identifiers); - enum_variants.extend(&token_identifiers); - let unique_enum_variants = enum_variants.into_iter().collect::<Vec<_>>(); + // collect extra keywords + EXTRA.iter().for_each(|&name| { + let variant_name = format_ident!("{}", name); + enum_variants.push(quote! { #variant_name }); + }); + + // collect whitespace variants + WHITESPACE.iter().for_each(|&name| { + let variant_name = format_ident!("{}", name); + enum_variants.push(quote! { #variant_name }); + }); + + // collect punctuations + PUNCT.iter().for_each(|&(_ascii_name, variant)| { + let variant_name = format_ident!("{}", variant); + enum_variants.push(quote! { #variant_name }); + }); + + // collect literals + LITERALS.iter().for_each(|&name| { + let variant_name = format_ident!("{}", name); + enum_variants.push(quote! { #variant_name }); + }); quote! { - /// An u32 enum of all valid syntax elements (nodes and tokens) of the postgres - /// sql dialect, and a few custom ones that are not parsed by pg_query.rs, such - /// as `Whitespace`. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] - #[repr(u32)] + #[repr(u16)] pub enum SyntaxKind { - #(#unique_enum_variants),*, - } - - #syntax_kind_from_impl - } -} - -fn custom_node_names() -> Vec<&'static str> { - vec![ - "SourceFile", - "Comment", - "Whitespace", - "Newline", - "Tab", - "Stmt", - "Eof", - ] -} - -fn custom_node_identifiers(custom_node_names: &[&str]) -> Vec<Ident> { - custom_node_names - .iter() - .map(|&node_name| format_ident!("{}", node_name)) - .collect() -} - -fn node_identifiers(nodes: &[Node]) -> Vec<Ident> { - nodes - .iter() - .map(|node| format_ident!("{}", &node.name)) - .collect() -} - -fn token_identifiers(tokens: &[Token]) -> Vec<Ident> { - tokens - .iter() - .map(|token| format_ident!("{}", &token.name)) - .collect() -} - -fn token_value_literals(tokens: &[Token]) -> Vec<Literal> { - tokens - .iter() - .map(|token| Literal::i32_unsuffixed(token.value)) - .collect() -} - -fn syntax_kind_from_impl( - node_identifiers: &[Ident], - token_identifiers: &[Ident], - token_value_literals: &[Literal], -) -> proc_macro2::TokenStream { - quote! 
{ - /// Converts a `pg_query` node to a `SyntaxKind` - impl From<&NodeEnum> for SyntaxKind { - fn from(node: &NodeEnum) -> SyntaxKind { - match node { - #(NodeEnum::#node_identifiers(_) => SyntaxKind::#node_identifiers),* - } - } - - } - - impl From<Token> for SyntaxKind { - fn from(token: Token) -> SyntaxKind { - match i32::from(token) { - #(#token_value_literals => SyntaxKind::#token_identifiers),*, - _ => panic!("Unknown token: {:?}", token), - } - } + #(#enum_variants),*, } - impl From<&ScanToken> for SyntaxKind { - fn from(token: &ScanToken) -> SyntaxKind { - match token.token { - #(#token_value_literals => SyntaxKind::#token_identifiers),*, - _ => panic!("Unknown token: {:?}", token.token), + impl SyntaxKind { + pub(crate) fn from_keyword(ident: &str) -> Option<SyntaxKind> { + let lower_ident = ident.to_ascii_lowercase(); + match lower_ident.as_str() { + #(#from_kw_match_arms),*, + _ => None } } } diff --git a/crates/pgt_query_ext/Cargo.toml b/crates/pgt_query_ext/Cargo.toml index c6754b67..3e6b57c1 100644 --- a/crates/pgt_query_ext/Cargo.toml +++ b/crates/pgt_query_ext/Cargo.toml @@ -14,11 +14,9 @@ version = "0.0.0" [dependencies] petgraph = "0.6.4" -pg_query.workspace = true -pgt_diagnostics.workspace = true -pgt_lexer.workspace = true -pgt_query_ext_codegen.workspace = true -pgt_text_size.workspace = true +pg_query.workspace = true +pgt_diagnostics.workspace = true +pgt_text_size.workspace = true [lib] doctest = false diff --git a/crates/pgt_query_ext/src/codegen.rs b/crates/pgt_query_ext/src/codegen.rs deleted file mode 100644 index 8278383b..00000000 --- a/crates/pgt_query_ext/src/codegen.rs +++ /dev/null @@ -1 +0,0 @@ -pgt_query_ext_codegen::codegen!(); diff --git a/crates/pgt_query_ext/src/lib.rs b/crates/pgt_query_ext/src/lib.rs index c1f5fb49..a087ec60 100644 --- a/crates/pgt_query_ext/src/lib.rs +++ b/crates/pgt_query_ext/src/lib.rs @@ -9,16 +9,11 @@ //! - `get_node_properties` to get the properties of a node //! - `get_nodes` to get all the nodes in the AST as a petgraph tree //! - `ChildrenIterator` to iterate over the children of a node -mod codegen; pub mod diagnostics; pub use pg_query::protobuf; pub use pg_query::{Error, NodeEnum, Result}; -pub use codegen::{ - ChildrenIterator, Node, TokenProperty, get_location, get_node_properties, get_nodes, -}; - pub fn parse(sql: &str) -> Result<NodeEnum> { pg_query::parse(sql).map(|parsed| { parsed diff --git a/crates/pgt_query_ext_codegen/src/get_location.rs b/crates/pgt_query_ext_codegen/src/get_location.rs deleted file mode 100644 index fa6fa8b2..00000000 --- a/crates/pgt_query_ext_codegen/src/get_location.rs +++ /dev/null @@ -1,122 +0,0 @@ -use pgt_query_proto_parser::{FieldType, Node, ProtoFile}; -use proc_macro2::{Ident, TokenStream}; -use quote::{format_ident, quote}; - -pub fn get_location_mod(proto_file: &ProtoFile) -> proc_macro2::TokenStream { - let manual_node_names = manual_node_names(); - - let node_identifiers = node_identifiers(&proto_file.nodes, &manual_node_names); - let location_idents = location_idents(&proto_file.nodes, &manual_node_names); - - quote! { - /// Returns the location of a node - pub fn get_location(node: &NodeEnum) -> Option<usize> { - let loc = get_location_internal(node); - if loc.is_some() { - usize::try_from(loc.unwrap()).ok() - } else { - None - } - } - - fn get_location_internal(node: &NodeEnum) -> Option<i32> { - let location = match node { - // for some nodes, the location of the node itself is after their children location. - // we implement the logic for those nodes manually. 
- // if you add one, make sure to add its name to `manual_node_names()`. - NodeEnum::BoolExpr(n) => { - let a = n.args.iter().min_by(|a, b| { - let loc_a = get_location_internal(&a.node.as_ref().unwrap()); - let loc_b = get_location_internal(&b.node.as_ref().unwrap()); - loc_a.cmp(&loc_b) - }); - get_location_internal(&a.unwrap().node.as_ref().unwrap()) - }, - NodeEnum::AExpr(n) => get_location_internal(&n.lexpr.as_ref().unwrap().node.as_ref().unwrap()), - NodeEnum::WindowDef(n) => { - if n.partition_clause.len() > 0 || n.order_clause.len() > 0 { - // the location is not correct if its the definition clause, e.g. for - // window w as (partition by a order by b) - // the location is the start of the `partition` token - None - } else { - Some(n.location) - } - }, - NodeEnum::CollateClause(n) => get_location_internal(&n.arg.as_ref().unwrap().node.as_ref().unwrap()), - NodeEnum::TypeCast(n) => get_location_internal(&n.arg.as_ref().unwrap().node.as_ref().unwrap()), - NodeEnum::ColumnDef(n) => if n.colname.len() > 0 { - Some(n.location) - } else { - None - }, - NodeEnum::NullTest(n) => if n.arg.is_some() { - get_location_internal(&n.arg.as_ref().unwrap().node.as_ref().unwrap()) - } else { - Some(n.location) - }, - NodeEnum::PublicationObjSpec(n) => { - match &n.pubtable { - Some(pubtable) => match &pubtable.relation { - Some(range_var) => Some(range_var.location), - None => Some(n.location), - }, - None => Some(n.location), - } - }, - NodeEnum::BooleanTest(n) => { - if n.arg.is_some() { - get_location_internal(&n.arg.as_ref().unwrap().node.as_ref().unwrap()) - } else { - Some(n.location) - } - }, - #(NodeEnum::#node_identifiers(n) => #location_idents),* - }; - if location.is_some() && location.unwrap() < 0 { - None - } else { - location - } - } - } -} - -fn manual_node_names() -> Vec<&'static str> { - vec![ - "BoolExpr", - "AExpr", - "WindowDef", - "CollateClause", - "TypeCast", - "ColumnDef", - "NullTest", - "PublicationObjSpec", - ] -} - -fn location_idents(nodes: &[Node], exclude_nodes: &[&str]) -> Vec { - nodes - .iter() - .filter(|n| !exclude_nodes.contains(&n.name.as_str())) - .map(|node| { - if node - .fields - .iter() - .any(|n| n.name == "location" && n.field_type == FieldType::Int32) - { - quote! { Some(n.location) } - } else { - quote! { None } - } - }) - .collect() -} - -fn node_identifiers(nodes: &[Node], exclude_nodes: &[&str]) -> Vec { - nodes - .iter() - .filter(|n| !exclude_nodes.contains(&n.name.as_str())) - .map(|node| format_ident!("{}", &node.name)) - .collect() -} diff --git a/crates/pgt_query_ext_codegen/src/get_node_properties.rs b/crates/pgt_query_ext_codegen/src/get_node_properties.rs deleted file mode 100644 index 9581304b..00000000 --- a/crates/pgt_query_ext_codegen/src/get_node_properties.rs +++ /dev/null @@ -1,1006 +0,0 @@ -use pgt_query_proto_parser::{FieldType, Node, ProtoFile}; -use proc_macro2::{Ident, TokenStream}; -use quote::{format_ident, quote}; - -pub fn get_node_properties_mod(proto_file: &ProtoFile) -> proc_macro2::TokenStream { - let node_identifiers = node_identifiers(&proto_file.nodes); - let node_handlers = node_handlers(&proto_file.nodes); - - quote! 
{ - #[derive(Debug, Clone, PartialEq)] - pub struct TokenProperty { - pub value: Option, - pub kind: Option, - } - - impl TokenProperty { - pub fn new(value: Option, kind: Option) -> TokenProperty { - if value.is_none() && kind.is_none() { - panic!("TokenProperty must have either value or kind"); - } - TokenProperty { value, kind } - } - } - - impl From for TokenProperty { - fn from(value: i32) -> TokenProperty { - TokenProperty { - value: Some(value.to_string()), - kind: None, - } - } - } - - impl From for TokenProperty { - fn from(value: u32) -> TokenProperty { - TokenProperty { - value: Some(value.to_string()), - kind: None, - } - } - } - - - impl From for TokenProperty { - fn from(value: i64) -> TokenProperty { - TokenProperty { - value: Some(value.to_string()), - kind: None, - } - } - } - - impl From for TokenProperty { - fn from(value: u64) -> TokenProperty { - TokenProperty { - value: Some(value.to_string()), - kind: None, - } - } - } - - impl From for TokenProperty { - fn from(value: f64) -> TokenProperty { - TokenProperty { - value: Some(value.to_string()), - kind: None, - } - } - } - - impl From for TokenProperty { - fn from(value: bool) -> TokenProperty { - TokenProperty { - value: Some(value.to_string()), - kind: None, - } - } - } - - impl From for TokenProperty { - fn from(value: String) -> TokenProperty { - assert!(value.len() > 0, "String property value has length 0"); - TokenProperty { - value: Some(value.to_lowercase()), - kind: None, - } - } - } - - - impl From<&pg_query::protobuf::Integer> for TokenProperty { - fn from(node: &pg_query::protobuf::Integer) -> TokenProperty { - TokenProperty { - value: Some(node.ival.to_string()), - kind: Some(SyntaxKind::Iconst) - } - } - } - - impl From<&pg_query::protobuf::Boolean> for TokenProperty { - fn from(node: &pg_query::protobuf::Boolean) -> TokenProperty { - TokenProperty { - value: Some(node.boolval.to_string()), - kind: match node.boolval { - true => Some(SyntaxKind::TrueP), - false => Some(SyntaxKind::FalseP), - } - } - } - } - - impl From for TokenProperty { - fn from(kind: SyntaxKind) -> TokenProperty { - TokenProperty { - value: None, - kind: Some(kind), - } - } - } - - impl From for TokenProperty { - fn from(token: Token) -> TokenProperty { - TokenProperty { - value: None, - kind: Some(SyntaxKind::from(token)), - } - } - } - - pub fn get_node_properties(node: &NodeEnum, parent: Option<&NodeEnum>) -> Vec { - let mut tokens: Vec = Vec::new(); - - match node { - #(NodeEnum::#node_identifiers(n) => {#node_handlers}),*, - }; - - tokens - } - - } -} - -fn node_identifiers(nodes: &[Node]) -> Vec { - nodes - .iter() - .map(|node| format_ident!("{}", &node.name)) - .collect() -} - -fn node_handlers(nodes: &[Node]) -> Vec { - nodes - .iter() - .map(|node| { - let string_property_handlers = string_property_handlers(node); - let custom_handlers = custom_handlers(node); - quote! { - #custom_handlers - #(#string_property_handlers)* - } - }) - .collect() -} - -fn custom_handlers(node: &Node) -> TokenStream { - match node.name.as_str() { - "SelectStmt" => quote! 
{ - tokens.push(TokenProperty::from(Token::Select)); - if n.distinct_clause.len() > 0 { - tokens.push(TokenProperty::from(Token::Distinct)); - } - if n.values_lists.len() > 0 { - tokens.push(TokenProperty::from(Token::Values)); - } - if n.from_clause.len() > 0 { - tokens.push(TokenProperty::from(Token::From)); - } - if n.where_clause.is_some() { - tokens.push(TokenProperty::from(Token::Where)); - } - if n.group_clause.len() > 0 { - tokens.push(TokenProperty::from(Token::GroupP)); - tokens.push(TokenProperty::from(Token::By)); - } - match n.op() { - protobuf::SetOperation::Undefined => {}, - protobuf::SetOperation::SetopNone => {}, - protobuf::SetOperation::SetopUnion => tokens.push(TokenProperty::from(Token::Union)), - protobuf::SetOperation::SetopIntersect => tokens.push(TokenProperty::from(Token::Intersect)), - protobuf::SetOperation::SetopExcept => tokens.push(TokenProperty::from(Token::Except)), - _ => panic!("Unknown SelectStmt op {:#?}", n.op()), - } - if n.all { - tokens.push(TokenProperty::from(Token::All)); - } - }, - "BoolExpr" => quote! { - match n.boolop() { - protobuf::BoolExprType::AndExpr => tokens.push(TokenProperty::from(Token::And)), - protobuf::BoolExprType::OrExpr => tokens.push(TokenProperty::from(Token::Or)), - protobuf::BoolExprType::NotExpr => tokens.push(TokenProperty::from(Token::Not)), - _ => panic!("Unknown BoolExpr {:#?}", n.boolop()), - } - }, - "JoinExpr" => quote! { - tokens.push(TokenProperty::from(Token::Join)); - tokens.push(TokenProperty::from(Token::On)); - match n.jointype() { - protobuf::JoinType::JoinInner => tokens.push(TokenProperty::from(Token::InnerP)), - protobuf::JoinType::JoinLeft => tokens.push(TokenProperty::from(Token::Left)), - protobuf::JoinType::JoinFull => tokens.push(TokenProperty::from(Token::Full)), - protobuf::JoinType::JoinRight => tokens.push(TokenProperty::from(Token::Right)), - _ => panic!("Unknown JoinExpr jointype {:#?}", n.jointype()), - } - - }, - "ResTarget" => quote! { - if n.name.len() > 0 { - tokens.push(TokenProperty::from(Token::As)); - } - }, - "Integer" => quote! { - tokens.push(TokenProperty::from(n)); - }, - "DefElem" => quote! { - match n.defname.as_str() { - "location" => { - tokens.push(TokenProperty::from(Token::Default)); - }, - "connection_limit" => { - tokens.push(TokenProperty::from(Token::Limit)); - tokens.push(TokenProperty::from(Token::Iconst)); - }, - "owner" => { - tokens.push(TokenProperty::from(Token::Owner)); - } - _ => {} - } - match n.defaction() { - protobuf::DefElemAction::DefelemUnspec => tokens.push(TokenProperty::from(Token::Ascii61)), - _ => panic!("Unknown DefElem {:#?}", n.defaction()), - } - }, - "Alias" => quote! { - tokens.push(TokenProperty::from(Token::As)); - }, - "CollateClause" => quote! { - tokens.push(TokenProperty::from(Token::Collate)); - }, - "AExpr" => quote! { - match n.kind() { - protobuf::AExprKind::AexprOp => {}, // do nothing - protobuf::AExprKind::AexprOpAny => tokens.push(TokenProperty::from(Token::Any)), - protobuf::AExprKind::AexprIn => tokens.push(TokenProperty::from(Token::InP)), - _ => panic!("Unknown AExpr kind {:#?}", n.kind()), - } - }, - "WindowDef" => quote! { - if n.partition_clause.len() > 0 || n.order_clause.len() > 0 { - tokens.push(TokenProperty::from(Token::Window)); - tokens.push(TokenProperty::from(Token::As)); - } - if n.partition_clause.len() > 0 { - tokens.push(TokenProperty::from(Token::Partition)); - tokens.push(TokenProperty::from(Token::By)); - } - }, - "Boolean" => quote! { - tokens.push(TokenProperty::from(n)); - }, - "AStar" => quote! 
{ - tokens.push(TokenProperty::from(Token::Ascii42)); - }, - "FuncCall" => quote! { - if n.funcname.len() == 1 && n.args.len() == 0 { - // check if count(*) - if let Some(node) = &n.funcname[0].node { - if let NodeEnum::String(n) = node { - if n.sval == "count" { - tokens.push(TokenProperty::from(Token::Ascii42)); - } - } - } - } - if n.agg_filter.is_some() { - tokens.push(TokenProperty::from(Token::Filter)); - tokens.push(TokenProperty::from(Token::Where)); - } - if n.over.is_some() { - tokens.push(TokenProperty::from(Token::Over)); - } - }, - "SqlvalueFunction" => quote! { - match n.op() { - protobuf::SqlValueFunctionOp::SvfopCurrentRole => tokens.push(TokenProperty::from(Token::CurrentRole)), - protobuf::SqlValueFunctionOp::SvfopCurrentUser => tokens.push(TokenProperty::from(Token::CurrentUser)), - _ => panic!("Unknown SqlvalueFunction {:#?}", n.op()), - } - }, - "SortBy" => quote! { - tokens.push(TokenProperty::from(Token::Order)); - tokens.push(TokenProperty::from(Token::By)); - match n.sortby_dir() { - protobuf::SortByDir::SortbyAsc => tokens.push(TokenProperty::from(Token::Asc)), - protobuf::SortByDir::SortbyDesc => tokens.push(TokenProperty::from(Token::Desc)), - _ => {} - } - }, - "AConst" => quote! { - if n.isnull { - tokens.push(TokenProperty::from(Token::NullP)); - } - }, - "AlterTableStmt" => quote! { - tokens.push(TokenProperty::from(Token::Alter)); - tokens.push(TokenProperty::from(Token::Table)); - }, - "AlterTableCmd" => quote! { - match n.subtype() { - protobuf::AlterTableType::AtColumnDefault => { - tokens.push(TokenProperty::from(Token::Alter)); - tokens.push(TokenProperty::from(Token::Column)); - tokens.push(TokenProperty::from(Token::Set)); - tokens.push(TokenProperty::from(Token::Default)); - }, - protobuf::AlterTableType::AtAddConstraint => tokens.push(TokenProperty::from(Token::AddP)), - protobuf::AlterTableType::AtAlterColumnType => { - tokens.push(TokenProperty::from(Token::Alter)); - tokens.push(TokenProperty::from(Token::Column)); - tokens.push(TokenProperty::from(Token::TypeP)); - }, - protobuf::AlterTableType::AtDropColumn => { - tokens.push(TokenProperty::from(Token::Drop)); - tokens.push(TokenProperty::from(Token::Column)); - }, - _ => panic!("Unknown AlterTableCmd {:#?}", n.subtype()), - } - }, - "VariableSetStmt" => quote! { - tokens.push(TokenProperty::from(Token::Set)); - match n.kind() { - protobuf::VariableSetKind::VarSetValue => tokens.push(TokenProperty::from(Token::To)), - _ => panic!("Unknown VariableSetStmt {:#?}", n.kind()), - } - }, - "CreatePolicyStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::Policy)); - tokens.push(TokenProperty::from(Token::On)); - if n.roles.len() > 0 { - tokens.push(TokenProperty::from(Token::To)); - } - if n.qual.is_some() { - tokens.push(TokenProperty::from(Token::Using)); - } - if n.with_check.is_some() { - tokens.push(TokenProperty::from(Token::With)); - tokens.push(TokenProperty::from(Token::Check)); - } - }, - "CopyStmt" => quote! { - tokens.push(TokenProperty::from(Token::Copy)); - tokens.push(TokenProperty::from(Token::From)); - }, - "RenameStmt" => quote! { - tokens.push(TokenProperty::from(Token::Alter)); - tokens.push(TokenProperty::from(Token::Table)); - tokens.push(TokenProperty::from(Token::Rename)); - tokens.push(TokenProperty::from(Token::To)); - }, - "Constraint" => quote! 
{ - match n.contype() { - protobuf::ConstrType::ConstrNotnull => { - tokens.push(TokenProperty::from(Token::Not)); - tokens.push(TokenProperty::from(Token::NullP)); - }, - protobuf::ConstrType::ConstrDefault => tokens.push(TokenProperty::from(Token::Default)), - protobuf::ConstrType::ConstrCheck => tokens.push(TokenProperty::from(Token::Check)), - protobuf::ConstrType::ConstrPrimary => { - tokens.push(TokenProperty::from(Token::Primary)); - tokens.push(TokenProperty::from(Token::Key)); - }, - protobuf::ConstrType::ConstrForeign => tokens.push(TokenProperty::from(Token::References)), - protobuf::ConstrType::ConstrUnique => tokens.push(TokenProperty::from(Token::Unique)), - _ => panic!("Unknown Constraint {:#?}", n.contype()), - }; - if n.options.len() > 0 { - tokens.push(TokenProperty::from(Token::With)); - } - }, - "PartitionSpec" => quote! { - tokens.push(TokenProperty::from(Token::Partition)); - tokens.push(TokenProperty::from(Token::By)); - }, - "InsertStmt" => quote! { - tokens.push(TokenProperty::from(Token::Insert)); - tokens.push(TokenProperty::from(Token::Into)); - }, - "DeleteStmt" => quote! { - tokens.push(TokenProperty::from(Token::DeleteP)); - tokens.push(TokenProperty::from(Token::From)); - if n.where_clause.is_some() { - tokens.push(TokenProperty::from(Token::Where)); - } - if n.using_clause.len() > 0 { - tokens.push(TokenProperty::from(Token::Using)); - } - }, - "ViewStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::View)); - if n.query.is_some() { - tokens.push(TokenProperty::from(Token::As)); - // check if SelectStmt with WithClause with recursive set to true - if let Some(NodeEnum::SelectStmt(select_stmt)) = n.query.as_ref().and_then(|query| query.node.as_ref()) { - if select_stmt.with_clause.is_some() && select_stmt.with_clause.as_ref().unwrap().recursive { - tokens.push(TokenProperty::from(Token::Recursive)); - } - } - } - if n.replace { - tokens.push(TokenProperty::from(Token::Or)); - tokens.push(TokenProperty::from(Token::Replace)); - } - if let Some(n) = &n.view { - match n.relpersistence.as_str() { - // Temporary - "t" => tokens.push(TokenProperty::from(Token::Temporary)), - _ => {}, - } - } - match n.with_check_option() { - protobuf::ViewCheckOption::LocalCheckOption => { - tokens.push(TokenProperty::from(Token::With)); - tokens.push(TokenProperty::from(Token::Local)); - tokens.push(TokenProperty::from(Token::Check)); - tokens.push(TokenProperty::from(Token::Option)); - }, - protobuf::ViewCheckOption::CascadedCheckOption => { - tokens.push(TokenProperty::from(Token::With)); - tokens.push(TokenProperty::from(Token::Cascaded)); - tokens.push(TokenProperty::from(Token::Check)); - tokens.push(TokenProperty::from(Token::Option)); - }, - _ => {} - } - }, - "CreateStmt" => quote! 
{ - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::Table)); - if n.tablespacename.len() > 0 { - tokens.push(TokenProperty::from(Token::Tablespace)); - } - if n.options.len() > 0 { - tokens.push(TokenProperty::from(Token::With)); - } - if n.if_not_exists { - tokens.push(TokenProperty::from(Token::IfP)); - tokens.push(TokenProperty::from(Token::Not)); - tokens.push(TokenProperty::from(Token::Exists)); - } - if n.partbound.is_some() { - tokens.push(TokenProperty::from(Token::Partition)); - tokens.push(TokenProperty::from(Token::Of)); - tokens.push(TokenProperty::from(Token::For)); - tokens.push(TokenProperty::from(Token::Values)); - } - if let Some(n) = &n.relation { - match n.relpersistence.as_str() { - // Unlogged - "u" => tokens.push(TokenProperty::from(Token::Unlogged)), - // Temporary - "t" => tokens.push(TokenProperty::from(Token::Temporary)), - _ => {}, - } - if n.inh { - tokens.push(TokenProperty::from(Token::Inherits)); - } - } - }, - "TableLikeClause" => quote! { - tokens.push(TokenProperty::from(Token::Like)); - // CREATE_TABLE_LIKE_ALL - if n.options == 0x7FFFFFFF { - tokens.push(TokenProperty::from(Token::Including)); - tokens.push(TokenProperty::from(Token::All)); - } else { - tokens.push(TokenProperty::from(Token::Excluding)); - tokens.push(TokenProperty::from(Token::All)); - } - }, - "TransactionStmt" => quote! { - match n.kind() { - protobuf::TransactionStmtKind::TransStmtBegin => tokens.push(TokenProperty::from(Token::BeginP)), - protobuf::TransactionStmtKind::TransStmtCommit => tokens.push(TokenProperty::from(Token::Commit)), - _ => panic!("Unknown TransactionStmt {:#?}", n.kind()) - } - }, - "PartitionBoundSpec" => quote! { - tokens.push(TokenProperty::from(Token::From)); - tokens.push(TokenProperty::from(Token::To)); - }, - "CaseExpr" => quote! { - tokens.push(TokenProperty::from(Token::Case)); - tokens.push(TokenProperty::from(Token::EndP)); - if n.defresult.is_some() { - tokens.push(TokenProperty::from(Token::Else)); - } - }, - "NullTest" => quote! { - match n.nulltesttype() { - protobuf::NullTestType::IsNull => tokens.push(TokenProperty::from(Token::Is)), - protobuf::NullTestType::IsNotNull => { - tokens.push(TokenProperty::from(Token::Is)); - tokens.push(TokenProperty::from(Token::Not)); - }, - _ => panic!("Unknown NullTest {:#?}", n.nulltesttype()), - } - tokens.push(TokenProperty::from(Token::NullP)); - }, - "CreateFunctionStmt" => quote! 
{ - tokens.push(TokenProperty::from(Token::Create)); - if n.is_procedure { - tokens.push(TokenProperty::from(Token::Procedure)); - } else { - tokens.push(TokenProperty::from(Token::Function)); - } - if n.replace { - tokens.push(TokenProperty::from(Token::Or)); - tokens.push(TokenProperty::from(Token::Replace)); - } - if let Some(return_type) = &n.return_type { - tokens.push(TokenProperty::from(Token::Returns)); - if return_type.setof { - tokens.push(TokenProperty::from(Token::Setof)); - } - } - for option in &n.options { - if let Some(NodeEnum::DefElem(node)) = &option.node { - if node.defname == "strict" { - if let Some(NodeEnum::Boolean(node)) = - node.arg.as_ref().and_then(|arg| arg.node.as_ref()) - { - if node.boolval { - tokens.push(TokenProperty::from(Token::NullP)); - tokens.push(TokenProperty::from(Token::On)); - tokens.push(TokenProperty::from(Token::NullP)); - tokens.push(TokenProperty::from(Token::InputP)); - } else { - tokens.push(TokenProperty::from(Token::On)); - tokens.push(TokenProperty::from(Token::NullP)); - tokens.push(TokenProperty::from(Token::InputP)); - } - } - } - } - } - }, - "FunctionParameter" => quote! { - match n.mode() { - protobuf::FunctionParameterMode::FuncParamIn => tokens.push(TokenProperty::from(Token::InP)), - protobuf::FunctionParameterMode::FuncParamOut => tokens.push(TokenProperty::from(Token::OutP)), - protobuf::FunctionParameterMode::FuncParamInout => tokens.push(TokenProperty::from(Token::Inout)), - protobuf::FunctionParameterMode::FuncParamVariadic => tokens.push(TokenProperty::from(Token::Variadic)), - // protobuf::FunctionParameterMode::FuncParamTable => tokens.push(TokenProperty::from(Token::Table)), - protobuf::FunctionParameterMode::FuncParamDefault => {}, // do nothing - _ => panic!("Unknown FunctionParameter {:#?}", n.mode()), - }; - if n.defexpr.is_some() { - tokens.push(TokenProperty::from(Token::Default)); - } - }, - "NamedArgExpr" => quote! { - // => - tokens.push(TokenProperty::from(Token::EqualsGreater)); - }, - "CaseWhen" => quote! { - tokens.push(TokenProperty::from(Token::When)); - tokens.push(TokenProperty::from(Token::Then)); - }, - "TypeCast" => quote! { - tokens.push(TokenProperty::from(Token::Typecast)); - }, - "CreateDomainStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::DomainP)); - if n.type_name.is_some() { - tokens.push(TokenProperty::from(Token::As)); - } - }, - "List" => quote! 
{ - if parent.is_some() { - // if parent is `DefineStmt`, we need to check whether an ORDER BY needs to be added - if let NodeEnum::DefineStmt(define_stmt) = parent.unwrap() { - // there *seems* to be an integer node in the last position of the DefineStmt args that - // defines whether the list contains an order by statement - let integer = define_stmt.args.last() - .and_then(|node| node.node.as_ref()) - .and_then(|node| if let NodeEnum::Integer(n) = node { Some(n.ival) } else { None }); - if integer.is_none() { - panic!("DefineStmt of type ObjectAggregate has no integer node in last position of args"); - } - // if the integer is 1, then there is an order by statement - // we add it to the `List` node because that seems to make most sense based off the grammar definition - // ref: https://github.com/postgres/postgres/blob/REL_15_STABLE/src/backend/parser/gram.y#L8355 - // ``` - // aggr_args: - // | '(' aggr_args_list ORDER BY aggr_args_list ')' - // ``` - if integer.unwrap() == 1 { - tokens.push(TokenProperty::from(Token::Order)); - tokens.push(TokenProperty::from(Token::By)); - } - } - } - }, - "DefineStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - if n.replace { - tokens.push(TokenProperty::from(Token::Or)); - tokens.push(TokenProperty::from(Token::Replace)); - } - match n.kind() { - protobuf::ObjectType::ObjectAggregate => { - tokens.push(TokenProperty::from(Token::Aggregate)); - - // n.args is always an array with two nodes - assert_eq!(n.args.len(), 2, "DefineStmt of type ObjectAggregate does not have exactly 2 args"); - // the first is either a List or a Node { node: None } - - if let Some(node) = &n.args.first() { - if node.node.is_none() { - // if first element is a Node { node: None }, then it's "*" - tokens.push(TokenProperty::from(Token::Ascii42)); - } } - // if its a list, we handle it in the handler for `List` - }, - protobuf::ObjectType::ObjectType => { - tokens.push(TokenProperty::from(Token::TypeP)); - }, - _ => panic!("Unknown DefineStmt {:#?}", n.kind()), - } - }, - "CreateSchemaStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::Schema)); - if n.if_not_exists { - tokens.push(TokenProperty::from(Token::IfP)); - tokens.push(TokenProperty::from(Token::Not)); - tokens.push(TokenProperty::from(Token::Exists)); - } - if n.authrole.is_some() { - tokens.push(TokenProperty::from(Token::Authorization)); - } - }, - "CreateEnumStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::TypeP)); - tokens.push(TokenProperty::from(Token::As)); - tokens.push(TokenProperty::from(Token::EnumP)); - }, - "CreateCastStmt" => quote! 
{ - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::Cast)); - tokens.push(TokenProperty::from(Token::As)); - if n.inout { - tokens.push(TokenProperty::from(Token::With)); - tokens.push(TokenProperty::from(Token::Inout)); - } else if n.func.is_some() { - tokens.push(TokenProperty::from(Token::With)); - tokens.push(TokenProperty::from(Token::Function)); - } else { - tokens.push(TokenProperty::from(Token::Without)); - tokens.push(TokenProperty::from(Token::Function)); - } - match n.context() { - protobuf::CoercionContext::CoercionImplicit => { - tokens.push(TokenProperty::from(Token::As)); - tokens.push(TokenProperty::from(Token::ImplicitP)); - }, - protobuf::CoercionContext::CoercionAssignment => { - tokens.push(TokenProperty::from(Token::As)); - tokens.push(TokenProperty::from(Token::Assignment)); - }, - protobuf::CoercionContext::CoercionPlpgsql => {}, - protobuf::CoercionContext::CoercionExplicit => {}, - _ => panic!("Unknown CreateCastStmt {:#?}", n.context()) - } - }, - "CreateRangeStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::TypeP)); - tokens.push(TokenProperty::from(Token::As)); - tokens.push(TokenProperty::from(Token::Range)); - }, - "IndexStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - if n.unique { - tokens.push(TokenProperty::from(Token::Unique)); - } - tokens.push(TokenProperty::from(Token::Index)); - if n.concurrent { - tokens.push(TokenProperty::from(Token::Concurrently)); - } - if n.if_not_exists { - tokens.push(TokenProperty::from(Token::IfP)); - tokens.push(TokenProperty::from(Token::Not)); - tokens.push(TokenProperty::from(Token::Exists)); - } - tokens.push(TokenProperty::from(Token::On)); - // access_method is btree by default - if n.access_method.len() > 0 { - tokens.push(TokenProperty::from(Token::Using)); - } - if n.index_including_params.len() > 0 { - tokens.push(TokenProperty::from(Token::Include)); - } - if n.options.len() > 0 { - tokens.push(TokenProperty::from(Token::With)); - } - // table_space is an empty string by default - if n.table_space.len() > 0 { - tokens.push(TokenProperty::from(Token::Tablespace)); - } - }, - "IndexElem" => quote! { - if n.collation.len() > 0 { - tokens.push(TokenProperty::from(Token::Collate)); - } - match n.nulls_ordering() { - protobuf::SortByNulls::SortbyNullsDefault => {}, - protobuf::SortByNulls::SortbyNullsFirst => { - tokens.push(TokenProperty::from(Token::NullsP)); - tokens.push(TokenProperty::from(Token::FirstP)); - }, - protobuf::SortByNulls::SortbyNullsLast => { - tokens.push(TokenProperty::from(Token::NullsP)); - tokens.push(TokenProperty::from(Token::LastP)); - }, - _ => panic!("Unknown IndexElem {:#?}", n.nulls_ordering()), - } - }, - "CreateTableSpaceStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::Tablespace)); - tokens.push(TokenProperty::from(Token::Location)); - if n.owner.is_some() { - tokens.push(TokenProperty::from(Token::Owner)); - } - if n.options.len() > 0 { - tokens.push(TokenProperty::from(Token::With)); - } - }, - "CreatePublicationStmt" => quote! 
{ - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::Publication)); - if n.for_all_tables { - tokens.push(TokenProperty::from(Token::For)); - tokens.push(TokenProperty::from(Token::All)); - tokens.push(TokenProperty::from(Token::Tables)); - } - if let Some(n) = n.options.first() { - tokens.push(TokenProperty::from(Token::With)); - } - if let Some(n) = n.pubobjects.first() { - tokens.push(TokenProperty::from(Token::For)); - if let Some(NodeEnum::PublicationObjSpec(n)) = &n.node { - match n.pubobjtype() { - protobuf::PublicationObjSpecType::PublicationobjTable => { - tokens.push(TokenProperty::from(Token::Table)); - }, - protobuf::PublicationObjSpecType::PublicationobjTablesInSchema => { - tokens.push(TokenProperty::from(Token::Tables)); - tokens.push(TokenProperty::from(Token::InP)); - tokens.push(TokenProperty::from(Token::Schema)); - }, - _ => panic!("Unknown CreatePublicationStmt {:#?}", n.pubobjtype()) - } - } - } - if let Some(n) = n.pubobjects.last() { - if let Some(NodeEnum::PublicationObjSpec(n)) = &n.node { - match n.pubobjtype() { - protobuf::PublicationObjSpecType::PublicationobjTablesInSchema => { - tokens.push(TokenProperty::from(Token::Tables)); - tokens.push(TokenProperty::from(Token::InP)); - tokens.push(TokenProperty::from(Token::Schema)); - }, - _ => {} - } - } - } - }, - "PublicationTable" => quote! { - if n.where_clause.is_some() { - tokens.push(TokenProperty::from(Token::Where)); - } - }, - "BooleanTest" => quote! { - match n.booltesttype() { - protobuf::BoolTestType::IsTrue => { - tokens.push(TokenProperty::from(Token::Is)); - tokens.push(TokenProperty::from(Token::TrueP)); - }, - protobuf::BoolTestType::IsNotTrue => { - tokens.push(TokenProperty::from(Token::Is)); - tokens.push(TokenProperty::from(Token::Not)); - tokens.push(TokenProperty::from(Token::TrueP)); - }, - protobuf::BoolTestType::IsFalse => { - tokens.push(TokenProperty::from(Token::Is)); - tokens.push(TokenProperty::from(Token::FalseP)); - }, - protobuf::BoolTestType::IsNotFalse => { - tokens.push(TokenProperty::from(Token::Is)); - tokens.push(TokenProperty::from(Token::Not)); - tokens.push(TokenProperty::from(Token::FalseP)); - }, - _ => panic!("Unknown BooleanTest {:#?}", n.booltesttype()), - } - }, - "CompositeTypeStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::TypeP)); - tokens.push(TokenProperty::from(Token::As)); - }, - "CreatedbStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::Database)); - }, - "CreateExtensionStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - tokens.push(TokenProperty::from(Token::Extension)); - if n.if_not_exists { - tokens.push(TokenProperty::from(Token::IfP)); - tokens.push(TokenProperty::from(Token::Not)); - tokens.push(TokenProperty::from(Token::Exists)); - } - }, - "CreateConversionStmt" => quote! { - tokens.push(TokenProperty::from(Token::Create)); - if n.def { - tokens.push(TokenProperty::from(Token::Default)); - } - tokens.push(TokenProperty::from(Token::ConversionP)); - if n.for_encoding_name.len() > 0 { - tokens.push(TokenProperty::from(Token::For)); - } - if n.to_encoding_name.len() > 0 { - tokens.push(TokenProperty::from(Token::To)); - } - if n.func_name.len() == 1 { - tokens.push(TokenProperty::from(Token::From)); - } else if n.func_name.len() > 1 { - panic!("Encountered multiple defined func_name elements in CreateConversionStmt"); - } - }, - "CreateTransformStmt" => quote! 
{ - tokens.push(TokenProperty::from(Token::Create)); - if n.replace { - tokens.push(TokenProperty::from(Token::Or)); - tokens.push(TokenProperty::from(Token::Replace)); - } - tokens.push(TokenProperty::from(Token::Transform)); - if n.type_name.is_some() { - tokens.push(TokenProperty::from(Token::For)); - } - tokens.push(TokenProperty::from(Token::Language)); - if n.fromsql.is_some() { - tokens.push(TokenProperty::from(Token::From)); - tokens.push(TokenProperty::from(Token::SqlP)); - tokens.push(TokenProperty::from(Token::With)); - tokens.push(TokenProperty::from(Token::Function)); - } - if n.tosql.is_some() { - tokens.push(TokenProperty::from(Token::To)); - tokens.push(TokenProperty::from(Token::SqlP)); - tokens.push(TokenProperty::from(Token::With)); - tokens.push(TokenProperty::from(Token::Function)); - } - }, - "TypeName" => quote! { - let names = n.names - .iter() - .filter_map(|n| if let Some(NodeEnum::String(s)) = &n.node { Some(s.sval.clone()) } else { None }) - .collect::>(); - - if names.len() == 2 && names[0] == "pg_catalog" { - match names[1].as_str() { - "float8" => { - tokens.push(TokenProperty::from(Token::DoubleP)); - tokens.push(TokenProperty::from(Token::Precision)); - }, - "interval" => { - // Adapted from https://github.com/postgres/postgres/blob/REL_15_STABLE/src/backend/utils/adt/timestamp.c#L1103 - const MONTH: i32 = 1; - const YEAR: i32 = 2; - const DAY: i32 = 3; - const HOUR: i32 = 10; - const MINUTE: i32 = 11; - const SECOND: i32 = 12; - - let fields = &n.typmods.first() - .and_then(|node| node.node.as_ref()) - .and_then(|node| if let NodeEnum::AConst(n) = node { n.val.clone() } else { None }) - .and_then(|node| if let protobuf::a_const::Val::Ival(n) = node { Some(n.ival) } else { None }); - - if let Some(fields) = fields { - match fields.clone() { - // YEAR TO MONTH - i if i == 1 << YEAR | 1 << MONTH => { - tokens.push(TokenProperty::from(Token::To)); - tokens.push(TokenProperty::from(Token::MonthP)); - }, - // DAY TO HOUR - i if i == 1 << DAY | 1 << HOUR => { - tokens.push(TokenProperty::from(Token::To)); - tokens.push(TokenProperty::from(Token::HourP)); - }, - // DAY TO MINUTE - i if i == 1 << DAY | 1 << HOUR | 1 << MINUTE => { - tokens.push(TokenProperty::from(Token::To)); - tokens.push(TokenProperty::from(Token::MinuteP)); - }, - // DAY TO SECOND - i if i == 1 << DAY | 1 << HOUR | 1 << MINUTE | 1 << SECOND => { - tokens.push(TokenProperty::from(Token::To)); - tokens.push(TokenProperty::from(Token::SecondP)); - }, - // HOUR TO MINUTE - i if i == 1 << HOUR | 1 << MINUTE => { - tokens.push(TokenProperty::from(Token::To)); - tokens.push(TokenProperty::from(Token::MinuteP)); - }, - // HOUR TO SECOND - i if i == 1 << HOUR | 1 << MINUTE | 1 << SECOND => { - tokens.push(TokenProperty::from(Token::To)); - tokens.push(TokenProperty::from(Token::SecondP)); - }, - // MINUTE TO SECOND - i if i == 1 << MINUTE | 1 << SECOND => { - tokens.push(TokenProperty::from(Token::To)); - tokens.push(TokenProperty::from(Token::SecondP)); - }, - _ => panic!("Unknown Interval fields {:#?}", fields), - } - } - }, - "timestamptz" => { - tokens.push(TokenProperty::from(Token::Timestamp)); - tokens.push(TokenProperty::from(Token::With)); - tokens.push(TokenProperty::from(Token::Time)); - tokens.push(TokenProperty::from(Token::Zone)); - } - "timetz" => { - tokens.push(TokenProperty::from(Token::Time)); - tokens.push(TokenProperty::from(Token::With)); - tokens.push(TokenProperty::from(Token::Time)); - tokens.push(TokenProperty::from(Token::Zone)); - } - _ => {} - } - } - }, - "TruncateStmt" => 
quote! { - tokens.push(TokenProperty::from(Token::Truncate)); - tokens.push(TokenProperty::from(Token::Table)); - if n.restart_seqs { - tokens.push(TokenProperty::from(Token::Restart)); - tokens.push(TokenProperty::from(Token::IdentityP)); - } else { - tokens.push(TokenProperty::from(Token::ContinueP)); - tokens.push(TokenProperty::from(Token::IdentityP)); - } - match n.behavior { - // DropRestrict - 1 => tokens.push(TokenProperty::from(Token::Restrict)), - // DropCascade - 2 => tokens.push(TokenProperty::from(Token::Cascade)), - _ => {} - } - }, - _ => quote! {}, - } -} - -fn string_property_handlers(node: &Node) -> Vec { - node.fields - .iter() - .filter_map(|field| { - if field.repeated { - return None; - } - let field_name = format_ident!("{}", field.name.as_str()); - match field.field_type { - // just handle string values for now - FieldType::String => Some(quote! { - // most string values are never None, but an empty string - if n.#field_name.len() > 0 { - tokens.push(TokenProperty::from(n.#field_name.to_owned())); - } - }), - _ => None, - } - }) - .collect() -} diff --git a/crates/pgt_query_ext_codegen/src/get_nodes.rs b/crates/pgt_query_ext_codegen/src/get_nodes.rs deleted file mode 100644 index e0381331..00000000 --- a/crates/pgt_query_ext_codegen/src/get_nodes.rs +++ /dev/null @@ -1,141 +0,0 @@ -use pgt_query_proto_parser::{FieldType, Node, ProtoFile}; -use proc_macro2::{Ident, TokenStream}; -use quote::{format_ident, quote}; - -pub fn get_nodes_mod(proto_file: &ProtoFile) -> proc_macro2::TokenStream { - let manual_node_names = manual_node_names(); - - let node_identifiers = node_identifiers(&proto_file.nodes, &manual_node_names); - let node_handlers = node_handlers(&proto_file.nodes, &manual_node_names); - - quote! { - #[derive(Debug, Clone)] - pub struct Node { - pub inner: NodeEnum, - pub depth: usize, - pub properties: Vec, - pub location: Option, - } - - /// Returns all children of the node, recursively - /// location is resolved manually - pub fn get_nodes(node: &NodeEnum) -> StableGraph { - let mut g = StableGraph::::new(); - - let root_node_idx = g.add_node(Node { - inner: node.to_owned(), - depth: 0, - properties: get_node_properties(node, None), - location: get_location(node), - }); - - // Parent node idx, Node, depth - let mut stack: VecDeque<(NodeIndex, NodeEnum, usize)> = - VecDeque::from(vec![(root_node_idx, node.to_owned(), 0)]); - while !stack.is_empty() { - let (parent_idx, node, depth) = stack.pop_front().unwrap(); - let current_depth = depth + 1; - let mut handle_child = |c: NodeEnum| { - if match &c { - // all "simple nodes" are not handled individually but merged with their parent - NodeEnum::String(n) => true, - NodeEnum::Integer(n) => true, - NodeEnum::Float(n) => true, - NodeEnum::Boolean(n) => true, - NodeEnum::BitString(n) => true, - _ => false - } { - g[parent_idx].properties.extend(get_node_properties(&c, Some(&node))); - } else { - let node_idx = g.add_node(Node { - depth: current_depth, - properties: get_node_properties(&c, Some(&node)), - location: get_location(&c), - inner: c.to_owned(), - }); - g.add_edge(parent_idx, node_idx, ()); - stack.push_back((node_idx, c.to_owned(), current_depth)); - } - }; - match &node { - // `AConst` is the only node with a `one of` property, so we handle it manually - // if you need to handle other nodes manually, add them to the `manual_node_names` function below - NodeEnum::AConst(n) => { - if n.val.is_some() { - handle_child(match n.val.to_owned().unwrap() { - pg_query::protobuf::a_const::Val::Ival(v) => 
NodeEnum::Integer(v), - pg_query::protobuf::a_const::Val::Fval(v) => NodeEnum::Float(v), - pg_query::protobuf::a_const::Val::Boolval(v) => NodeEnum::Boolean(v), - pg_query::protobuf::a_const::Val::Sval(v) => NodeEnum::String(v), - pg_query::protobuf::a_const::Val::Bsval(v) => NodeEnum::BitString(v), - }); - } - } - #(NodeEnum::#node_identifiers(n) => {#node_handlers}),*, - }; - } - g - } - } -} - -fn manual_node_names() -> Vec<&'static str> { - vec!["AConst"] -} - -fn node_identifiers(nodes: &[Node], exclude_nodes: &[&str]) -> Vec { - nodes - .iter() - .filter(|node| !exclude_nodes.contains(&node.name.as_str())) - .map(|node| format_ident!("{}", &node.name)) - .collect() -} - -fn node_handlers(nodes: &[Node], exclude_nodes: &[&str]) -> Vec { - nodes - .iter() - .filter(|node| !exclude_nodes.contains(&node.name.as_str())) - .map(|node| { - let property_handlers = property_handlers(node); - quote! { - #(#property_handlers)* - } - }) - .collect() -} - -fn property_handlers(node: &Node) -> Vec { - node.fields - .iter() - .filter_map(|field| { - let field_name = format_ident!("{}", field.name.as_str()); - if field.field_type == FieldType::Node && field.repeated { - Some(quote! { - n.#field_name - .iter() - .for_each(|x| if x.node.is_some() { - handle_child(x.node.as_ref().unwrap().to_owned()); - }); - }) - } else if field.field_type == FieldType::Node && !field.is_one_of { - if field.node_name == Some("Node".to_owned()) { - Some(quote! { - if n.#field_name.is_some() { - handle_child(n.#field_name.to_owned().unwrap().node.unwrap()); - } - }) - } else { - let enum_variant_name = - format_ident!("{}", field.enum_variant_name.as_ref().unwrap().as_str()); - Some(quote! { - if n.#field_name.is_some() { - handle_child(NodeEnum::#enum_variant_name(n.#field_name.to_owned().unwrap())); - } - }) - } - } else { - None - } - }) - .collect() -} diff --git a/crates/pgt_query_ext_codegen/src/lib.rs b/crates/pgt_query_ext_codegen/src/lib.rs deleted file mode 100644 index c4f39c0e..00000000 --- a/crates/pgt_query_ext_codegen/src/lib.rs +++ /dev/null @@ -1,48 +0,0 @@ -mod get_location; -mod get_node_properties; -mod get_nodes; -mod node_iterator; - -use get_location::get_location_mod; -use get_node_properties::get_node_properties_mod; -use get_nodes::get_nodes_mod; -use node_iterator::node_iterator_mod; -use pgt_query_proto_parser::ProtoParser; -use quote::quote; -use std::{env, path, path::Path}; - -#[proc_macro] -pub fn codegen(_input: proc_macro::TokenStream) -> proc_macro::TokenStream { - let parser = ProtoParser::new(&proto_file_path()); - let proto_file = parser.parse(); - - let get_location = get_location_mod(&proto_file); - let get_node_properties = get_node_properties_mod(&proto_file); - let get_nodes = get_nodes_mod(&proto_file); - let iterator = node_iterator_mod(&proto_file); - - quote! 
{ - use pgt_lexer::SyntaxKind; - use std::collections::VecDeque; - use pg_query::{protobuf, protobuf::ScanToken, protobuf::Token, NodeEnum, NodeRef}; - use std::cmp::{min, Ordering}; - use std::fmt::{Display, Formatter}; - use petgraph::stable_graph::{StableGraph}; - use petgraph::graph::{NodeIndex}; - - #get_location - #get_node_properties - #get_nodes - #iterator - } - .into() -} - -fn proto_file_path() -> path::PathBuf { - Path::new(env!("CARGO_MANIFEST_DIR")) - .ancestors() - .nth(2) - .unwrap() - .join("libpg_query/protobuf/pg_query.proto") - .to_path_buf() -} diff --git a/crates/pgt_query_ext_codegen/src/node_iterator.rs b/crates/pgt_query_ext_codegen/src/node_iterator.rs deleted file mode 100644 index 526966df..00000000 --- a/crates/pgt_query_ext_codegen/src/node_iterator.rs +++ /dev/null @@ -1,123 +0,0 @@ -use pgt_query_proto_parser::{FieldType, Node, ProtoFile}; -use proc_macro2::{Ident, TokenStream}; -use quote::{format_ident, quote}; - -pub fn node_iterator_mod(proto_file: &ProtoFile) -> proc_macro2::TokenStream { - let manual_node_names = manual_node_names(); - - let node_identifiers = node_identifiers(&proto_file.nodes, &manual_node_names); - let node_handlers = node_handlers(&proto_file.nodes, &manual_node_names); - - quote! { - #[derive(Debug, Clone)] - pub struct ChildrenIterator { - stack: VecDeque<(NodeEnum, usize)>, - nodes: Vec, - } - - impl ChildrenIterator { - pub fn new(root: NodeEnum) -> Self { - Self { - stack: VecDeque::from(vec![(root, 0)]), - nodes: Vec::new(), - } - } - } - - impl Iterator for ChildrenIterator { - type Item = NodeEnum; - - fn next(&mut self) -> Option { - if self.stack.is_empty() { - return None; - } - - let (node, depth) = self.stack.pop_front().unwrap(); - - let current_depth = depth + 1; - - match &node { - // `AConst` is the only node with a `one of` property, so we handle it manually - // if you need to handle other nodes manually, add them to the `manual_node_names` function below - NodeEnum::AConst(n) => { - // if n.val.is_some() { - // let new_node = match n.val.as_ref().unwrap() { - // pg_query::protobuf::a_const::Val::Ival(v) => Box::new(NodeEnum::Integer(v.clone())), - // pg_query::protobuf::a_const::Val::Fval(v) => Box::new(NodeEnum::Float(v.clone())), - // pg_query::protobuf::a_const::Val::Boolval(v) => Box::new(NodeEnum::Boolean(v.clone())), - // pg_query::protobuf::a_const::Val::Sval(v) => Box::new(NodeEnum::String(v.clone())), - // pg_query::protobuf::a_const::Val::Bsval(v) => Box::new(NodeEnum::BitString(v.clone())), - // }; - // self.stack.push_back((&new_node, current_depth)); - // self.boxed_nodes.push(new_node); - // } - } - #(NodeEnum::#node_identifiers(n) => {#node_handlers}),*, - }; - - Some(node) - } - } - } -} - -fn manual_node_names() -> Vec<&'static str> { - vec!["AConst"] -} - -fn node_identifiers(nodes: &[Node], exclude_nodes: &[&str]) -> Vec { - nodes - .iter() - .filter(|node| !exclude_nodes.contains(&node.name.as_str())) - .map(|node| format_ident!("{}", &node.name)) - .collect() -} - -fn node_handlers(nodes: &[Node], exclude_nodes: &[&str]) -> Vec { - nodes - .iter() - .filter(|node| !exclude_nodes.contains(&node.name.as_str())) - .map(|node| { - let property_handlers = property_handlers(node); - quote! { - #(#property_handlers)* - } - }) - .collect() -} - -fn property_handlers(node: &Node) -> Vec { - node.fields - .iter() - .filter_map(|field| { - let field_name = format_ident!("{}", field.name.as_str()); - if field.field_type == FieldType::Node && field.repeated { - Some(quote! 
{ - n.#field_name - .iter() - .for_each(|x| if x.node.is_some() { - self.stack.push_back((x.node.as_ref().unwrap().to_owned(), current_depth)); - }); - }) - } else if field.field_type == FieldType::Node && !field.is_one_of { - if field.node_name == Some("Node".to_owned()) { - Some(quote! { - if n.#field_name.is_some() { - self.stack.push_back((n.#field_name.to_owned().unwrap().node.unwrap(), current_depth)); - } - }) - } else { - let enum_variant_name = - format_ident!("{}", field.enum_variant_name.as_ref().unwrap().as_str()); - Some(quote! { - if n.#field_name.is_some() { - self.stack.push_back((NodeEnum::#enum_variant_name(n.#field_name.to_owned().unwrap()), current_depth)); - } - }) - } - } else { - None - } - }) - .collect() -} diff --git a/crates/pgt_statement_splitter/src/diagnostics.rs b/crates/pgt_statement_splitter/src/diagnostics.rs index bcff6e80..d543d4e5 100644 --- a/crates/pgt_statement_splitter/src/diagnostics.rs +++ b/crates/pgt_statement_splitter/src/diagnostics.rs @@ -1,6 +1,9 @@ use pgt_diagnostics::{Diagnostic, MessageAndDescription}; +use pgt_lexer::{LexDiagnostic, Lexed}; use pgt_text_size::TextRange; +use crate::splitter::SplitError; + /// A specialized diagnostic for the statement splitter parser. /// /// Parser diagnostics are always **errors**. @@ -23,3 +26,22 @@ impl SplitDiagnostic { } } } + +impl From for SplitDiagnostic { + fn from(lex_diagnostic: LexDiagnostic) -> Self { + Self { + span: Some(lex_diagnostic.span), + message: lex_diagnostic.message, + } + } +} + +impl SplitDiagnostic { + pub fn from_split_error(split_error: SplitError, lexed: &Lexed) -> Self { + let range = lexed.range(split_error.token); + Self { + span: Some(range), + message: MessageAndDescription::from(split_error.msg), + } + } +} diff --git a/crates/pgt_statement_splitter/src/lib.rs b/crates/pgt_statement_splitter/src/lib.rs index c53ae78c..de028336 100644 --- a/crates/pgt_statement_splitter/src/lib.rs +++ b/crates/pgt_statement_splitter/src/lib.rs @@ -2,19 +2,40 @@ //! //! This crate provides a function to split a SQL source string into individual statements. 
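The new diagnostics deliberately defer span resolution: a `SplitError` records only the index of the offending token, and `SplitDiagnostic::from_split_error` turns that index into a `TextRange` via `lexed.range(...)` once the lexed data is at hand. A minimal std-only sketch of that pattern, using plain `Range<usize>` stand-ins instead of the real `Lexed`/`TextRange` types:

use std::ops::Range;

// Stand-ins for illustration only; the real code uses pgt_lexer::Lexed and pgt_text_size::TextRange.
struct Lexed {
    ranges: Vec<Range<usize>>, // byte range of every token, in order
}

struct SplitError {
    msg: String,
    token: usize, // token index, resolved to a span only when diagnostics are built
}

struct SplitDiagnostic {
    span: Range<usize>,
    message: String,
}

fn resolve(err: SplitError, lexed: &Lexed) -> SplitDiagnostic {
    SplitDiagnostic {
        span: lexed.ranges[err.token].clone(),
        message: err.msg,
    }
}

fn main() {
    let lexed = Lexed { ranges: vec![0..6, 7..13, 13..14] };
    let err = SplitError { msg: "Expected INTO_KW".into(), token: 1 };
    let diag = resolve(err, &lexed);
    assert_eq!(diag.span, 7..13);
    println!("{} at {:?}", diag.message, diag.span);
}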
pub mod diagnostics; -mod parser; +mod splitter; -use parser::{Parser, ParserResult, source}; -use pgt_lexer::diagnostics::ScanError; +use diagnostics::SplitDiagnostic; +use pgt_lexer::Lexer; +use pgt_text_size::TextRange; +use splitter::{Splitter, source}; -pub fn split(sql: &str) -> Result> { - let tokens = pgt_lexer::lex(sql)?; +pub struct SplitResult { + pub ranges: Vec, + pub errors: Vec, +} + +pub fn split(sql: &str) -> SplitResult { + let lexed = Lexer::new(sql).lex(); + + let mut splitter = Splitter::new(&lexed); + + source(&mut splitter); + + let split_result = splitter.finish(); - let mut parser = Parser::new(tokens); + let mut errors: Vec = lexed.errors().into_iter().map(Into::into).collect(); - source(&mut parser); + errors.extend( + split_result + .errors + .into_iter() + .map(|err| SplitDiagnostic::from_split_error(err, &lexed)), + ); - Ok(parser.finish()) + SplitResult { + ranges: split_result.ranges, + errors, + } } #[cfg(test)] @@ -28,13 +49,13 @@ mod tests { struct Tester { input: String, - parse: ParserResult, + result: SplitResult, } impl From<&str> for Tester { fn from(input: &str) -> Self { Tester { - parse: split(input).expect("Failed to split"), + result: split(input), input: input.to_string(), } } @@ -43,25 +64,25 @@ mod tests { impl Tester { fn expect_statements(&self, expected: Vec<&str>) -> &Self { assert_eq!( - self.parse.ranges.len(), + self.result.ranges.len(), expected.len(), "Expected {} statements for input {}, got {}: {:?}", expected.len(), self.input, - self.parse.ranges.len(), - self.parse + self.result.ranges.len(), + self.result .ranges .iter() .map(|r| &self.input[*r]) .collect::>() ); - for (range, expected) in self.parse.ranges.iter().zip(expected.iter()) { + for (range, expected) in self.result.ranges.iter().zip(expected.iter()) { assert_eq!(*expected, self.input[*range].to_string()); } assert!( - self.parse.ranges.is_sorted_by_key(|r| r.start()), + self.result.ranges.is_sorted_by_key(|r| r.start()), "Ranges are not sorted" ); @@ -70,15 +91,15 @@ mod tests { fn expect_errors(&self, expected: Vec) -> &Self { assert_eq!( - self.parse.errors.len(), + self.result.errors.len(), expected.len(), "Expected {} errors, got {}: {:?}", expected.len(), - self.parse.errors.len(), - self.parse.errors + self.result.errors.len(), + self.result.errors ); - for (err, expected) in self.parse.errors.iter().zip(expected.iter()) { + for (err, expected) in self.result.errors.iter().zip(expected.iter()) { assert_eq!(expected, err); } @@ -93,13 +114,6 @@ mod tests { ); } - #[test] - fn failing_lexer() { - let input = "select 1443ddwwd33djwdkjw13331333333333"; - let res = split(input).unwrap_err(); - assert!(!res.is_empty()); - } - #[test] #[timeout(1000)] fn basic() { @@ -161,7 +175,7 @@ mod tests { Tester::from("\ninsert select 1\n\nselect 3") .expect_statements(vec!["insert select 1", "select 3"]) .expect_errors(vec![SplitDiagnostic::new( - format!("Expected {:?}", SyntaxKind::Into), + format!("Expected {:?}", SyntaxKind::INTO_KW), TextRange::new(8.into(), 14.into()), )]); } diff --git a/crates/pgt_statement_splitter/src/parser.rs b/crates/pgt_statement_splitter/src/parser.rs deleted file mode 100644 index 241d0c70..00000000 --- a/crates/pgt_statement_splitter/src/parser.rs +++ /dev/null @@ -1,237 +0,0 @@ -mod common; -mod data; -mod ddl; -mod dml; - -pub use common::source; - -use pgt_lexer::{SyntaxKind, Token, WHITESPACE_TOKENS}; -use pgt_text_size::{TextRange, TextSize}; - -use crate::diagnostics::SplitDiagnostic; - -/// Main parser that exposes the `cstree` api, and 
collects errors and statements -/// It is modelled after a Pratt Parser. For a gentle introduction to Pratt Parsing, see https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html -pub struct Parser { - /// The statement ranges are defined by the indices of the start/end tokens - stmt_ranges: Vec<(usize, usize)>, - - /// The syntax errors accumulated during parsing - errors: Vec, - - current_stmt_start: Option, - - tokens: Vec, - - eof_token: Token, - - current_pos: usize, -} - -#[derive(Debug)] -pub struct ParserResult { - /// The ranges of the parsed statements - pub ranges: Vec, - /// The syntax errors accumulated during parsing - pub errors: Vec, -} - -impl Parser { - pub fn new(tokens: Vec) -> Self { - let eof_token = Token::eof(usize::from( - tokens - .last() - .map(|t| t.span.end()) - .unwrap_or(TextSize::from(0)), - )); - - // Place `current_pos` on the first relevant token - let mut current_pos = 0; - while is_irrelevant_token(tokens.get(current_pos).unwrap_or(&eof_token)) { - current_pos += 1; - } - - Self { - stmt_ranges: Vec::new(), - eof_token, - errors: Vec::new(), - current_stmt_start: None, - tokens, - current_pos, - } - } - - pub fn finish(self) -> ParserResult { - ParserResult { - ranges: self - .stmt_ranges - .iter() - .map(|(start_token_pos, end_token_pos)| { - let from = self.tokens.get(*start_token_pos); - let to = self.tokens.get(*end_token_pos).unwrap_or(&self.eof_token); - - TextRange::new(from.unwrap().span.start(), to.span.end()) - }) - .collect(), - errors: self.errors, - } - } - - pub fn start_stmt(&mut self) { - assert!( - self.current_stmt_start.is_none(), - "cannot start statement within statement at {:?}", - self.tokens.get(self.current_stmt_start.unwrap()) - ); - self.current_stmt_start = Some(self.current_pos); - } - - pub fn close_stmt(&mut self) { - assert!( - self.current_stmt_start.is_some(), - "Must start statement before closing it." - ); - - let start_token_pos = self.current_stmt_start.unwrap(); - - assert!( - self.current_pos > start_token_pos, - "Must close the statement on a token that's later than the start token." - ); - - let (end_token_pos, _) = self.find_last_relevant().unwrap(); - - self.stmt_ranges.push((start_token_pos, end_token_pos)); - - self.current_stmt_start = None; - } - - fn current(&self) -> &Token { - match self.tokens.get(self.current_pos) { - Some(token) => token, - None => &self.eof_token, - } - } - - /// Advances the parser to the next relevant token and returns it. - /// - /// NOTE: This will skip irrelevant tokens. - fn advance(&mut self) -> &Token { - // can't reuse any `find_next_relevant` logic because of Mr. Borrow Checker - let (pos, token) = self - .tokens - .iter() - .enumerate() - .skip(self.current_pos + 1) - .find(|(_, t)| is_relevant(t)) - .unwrap_or((self.tokens.len(), &self.eof_token)); - - self.current_pos = pos; - token - } - - fn look_ahead(&self) -> Option<&Token> { - self.tokens - .iter() - .skip(self.current_pos + 1) - .find(|t| is_relevant(t)) - } - - /// Returns `None` if there are no previous relevant tokens - fn look_back(&self) -> Option<&Token> { - self.find_last_relevant().map(|it| it.1) - } - - /// Will advance if the `kind` matches the current token. - /// Otherwise, will add a diagnostic to the internal `errors`. 
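For orientation, the reworked public entry point no longer returns a `Result`: lexer diagnostics and splitter diagnostics are merged into `SplitResult::errors`, and each entry of `ranges` indexes directly into the input text. A minimal usage sketch, assuming the workspace crate as modified in this diff:

fn main() {
    let sql = "select 1;\n\nselect 2;";
    let result = pgt_statement_splitter::split(sql);

    // Each range points back into the original source text.
    for range in &result.ranges {
        println!("statement: {}", &sql[*range]);
    }

    // Lexer and splitter diagnostics arrive together.
    for err in &result.errors {
        eprintln!("{err:?}");
    }
}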
- pub fn expect(&mut self, kind: SyntaxKind) { - if self.current().kind == kind { - self.advance(); - } else { - self.errors.push(SplitDiagnostic::new( - format!("Expected {:#?}", kind), - self.current().span, - )); - } - } - - fn find_last_relevant(&self) -> Option<(usize, &Token)> { - self.tokens - .iter() - .enumerate() - .take(self.current_pos) - .rfind(|(_, t)| is_relevant(t)) - } -} - -#[cfg(windows)] -/// Returns true if the token is relevant for the parsing process -/// -/// On windows, a newline is represented by `\r\n` which is two characters. -fn is_irrelevant_token(t: &Token) -> bool { - WHITESPACE_TOKENS.contains(&t.kind) - // double new lines are relevant, single ones are not - && (t.kind != SyntaxKind::Newline || t.text == "\r\n" || t.text.chars().count() == 1) -} - -#[cfg(not(windows))] -/// Returns true if the token is relevant for the parsing process -fn is_irrelevant_token(t: &Token) -> bool { - WHITESPACE_TOKENS.contains(&t.kind) - // double new lines are relevant, single ones are not - && (t.kind != SyntaxKind::Newline || t.text.chars().count() == 1) -} - -fn is_relevant(t: &Token) -> bool { - !is_irrelevant_token(t) -} - -#[cfg(test)] -mod tests { - use pgt_lexer::SyntaxKind; - - use crate::parser::Parser; - - #[test] - fn advance_works_as_expected() { - let sql = r#" - create table users ( - id serial primary key, - name text, - email text - ); - "#; - let tokens = pgt_lexer::lex(sql).unwrap(); - let total_num_tokens = tokens.len(); - - let mut parser = Parser::new(tokens); - - let expected = vec![ - (SyntaxKind::Create, 2), - (SyntaxKind::Table, 4), - (SyntaxKind::Ident, 6), - (SyntaxKind::Ascii40, 8), - (SyntaxKind::Ident, 11), - (SyntaxKind::Ident, 13), - (SyntaxKind::Primary, 15), - (SyntaxKind::Key, 17), - (SyntaxKind::Ascii44, 18), - (SyntaxKind::NameP, 21), - (SyntaxKind::TextP, 23), - (SyntaxKind::Ascii44, 24), - (SyntaxKind::Ident, 27), - (SyntaxKind::TextP, 29), - (SyntaxKind::Ascii41, 32), - (SyntaxKind::Ascii59, 33), - ]; - - for (kind, pos) in expected { - assert_eq!(parser.current().kind, kind); - assert_eq!(parser.current_pos, pos); - parser.advance(); - } - - assert_eq!(parser.current().kind, SyntaxKind::Eof); - assert_eq!(parser.current_pos, total_num_tokens); - } -} diff --git a/crates/pgt_statement_splitter/src/parser/common.rs b/crates/pgt_statement_splitter/src/parser/common.rs deleted file mode 100644 index 4c4ab986..00000000 --- a/crates/pgt_statement_splitter/src/parser/common.rs +++ /dev/null @@ -1,307 +0,0 @@ -use pgt_lexer::{SyntaxKind, Token, TokenType, WHITESPACE_TOKENS}; - -use super::{ - Parser, - data::at_statement_start, - ddl::{alter, create}, - dml::{cte, delete, insert, select, update}, -}; - -pub fn source(p: &mut Parser) { - loop { - match p.current() { - Token { - kind: SyntaxKind::Eof, - .. - } => { - break; - } - Token { - // we might want to ignore TokenType::NoKeyword here too - // but this will lead to invalid statements to not being picked up - token_type: TokenType::Whitespace, - .. - } => { - p.advance(); - } - Token { - kind: SyntaxKind::Ascii92, - .. 
- } => { - plpgsql_command(p); - } - _ => { - statement(p); - } - } - } -} - -pub(crate) fn statement(p: &mut Parser) { - p.start_stmt(); - match p.current().kind { - SyntaxKind::With => { - cte(p); - } - SyntaxKind::Select => { - select(p); - } - SyntaxKind::Insert => { - insert(p); - } - SyntaxKind::Update => { - update(p); - } - SyntaxKind::DeleteP => { - delete(p); - } - SyntaxKind::Create => { - create(p); - } - SyntaxKind::Alter => { - alter(p); - } - _ => { - unknown(p, &[]); - } - } - p.close_stmt(); -} - -pub(crate) fn parenthesis(p: &mut Parser) { - p.expect(SyntaxKind::Ascii40); - - let mut depth = 1; - - loop { - match p.current().kind { - SyntaxKind::Ascii40 => { - p.advance(); - depth += 1; - } - SyntaxKind::Ascii41 | SyntaxKind::Eof => { - p.advance(); - depth -= 1; - if depth == 0 { - break; - } - } - _ => { - p.advance(); - } - } - } -} - -pub(crate) fn plpgsql_command(p: &mut Parser) { - p.expect(SyntaxKind::Ascii92); - - loop { - match p.current().kind { - SyntaxKind::Newline => { - p.advance(); - break; - } - _ => { - // advance the parser to the next token without ignoring irrelevant tokens - // we would skip a newline with `advance()` - p.current_pos += 1; - } - } - } -} - -pub(crate) fn case(p: &mut Parser) { - p.expect(SyntaxKind::Case); - - loop { - match p.current().kind { - SyntaxKind::EndP => { - p.advance(); - break; - } - _ => { - p.advance(); - } - } - } -} - -pub(crate) fn unknown(p: &mut Parser, exclude: &[SyntaxKind]) { - loop { - match p.current() { - Token { - kind: SyntaxKind::Ascii59, - .. - } => { - p.advance(); - break; - } - Token { - kind: SyntaxKind::Eof, - .. - } => { - break; - } - Token { - kind: SyntaxKind::Newline, - .. - } => { - if p.look_back().is_some_and(|t| t.kind == SyntaxKind::Ascii44) { - p.advance(); - } else { - break; - } - } - Token { - kind: SyntaxKind::Case, - .. - } => { - case(p); - } - Token { - kind: SyntaxKind::Ascii92, - .. - } => { - // pgsql commands e.g. - // - // ``` - // \if test - // ``` - // - // we wait for "\" and check if the previous token is a newline - - // newline is a whitespace, but we do not want to ignore it here - let irrelevant = WHITESPACE_TOKENS - .iter() - .filter(|t| **t != SyntaxKind::Newline) - .collect::>(); - - // go back from the current position without ignoring irrelevant tokens - if p.tokens - .iter() - .take(p.current_pos) - .rev() - .find(|t| !irrelevant.contains(&&t.kind)) - .is_some_and(|t| t.kind == SyntaxKind::Newline) - { - break; - } - p.advance(); - } - Token { - kind: SyntaxKind::Ascii40, - .. - } => { - parenthesis(p); - } - t => match at_statement_start(t.kind, exclude) { - Some(SyntaxKind::Select) => { - let prev = p.look_back().map(|t| t.kind); - if [ - // for policies, with for select - SyntaxKind::For, - // for create view / table as - SyntaxKind::As, - // for create rule - SyntaxKind::On, - // for create rule - SyntaxKind::Also, - // for create rule - SyntaxKind::Instead, - // for UNION - SyntaxKind::Union, - // for UNION ALL - SyntaxKind::All, - // for UNION ... EXCEPT - SyntaxKind::Except, - // for grant - SyntaxKind::Grant, - SyntaxKind::Ascii44, - ] - .iter() - .all(|x| Some(x) != prev.as_ref()) - { - break; - } - - p.advance(); - } - Some(SyntaxKind::Insert) | Some(SyntaxKind::Update) | Some(SyntaxKind::DeleteP) => { - let prev = p.look_back().map(|t| t.kind); - if [ - // for create trigger - SyntaxKind::Before, - SyntaxKind::After, - // for policies, e.g. for insert - SyntaxKind::For, - // e.g. on insert or delete - SyntaxKind::Or, - // e.g. 
INSTEAD OF INSERT - SyntaxKind::Of, - // for create rule - SyntaxKind::On, - // for create rule - SyntaxKind::Also, - // for create rule - SyntaxKind::Instead, - // for grant - SyntaxKind::Grant, - SyntaxKind::Ascii44, - // Do update in INSERT stmt - SyntaxKind::Do, - ] - .iter() - .all(|x| Some(x) != prev.as_ref()) - { - break; - } - p.advance(); - } - Some(SyntaxKind::With) => { - let next = p.look_ahead().map(|t| t.kind); - if [ - // WITH ORDINALITY should not start a new statement - SyntaxKind::Ordinality, - // WITH CHECK should not start a new statement - SyntaxKind::Check, - // TIMESTAMP WITH TIME ZONE should not start a new statement - SyntaxKind::Time, - SyntaxKind::Grant, - SyntaxKind::Admin, - SyntaxKind::Inherit, - SyntaxKind::Set, - ] - .iter() - .all(|x| Some(x) != next.as_ref()) - { - break; - } - p.advance(); - } - - Some(SyntaxKind::Create) => { - let prev = p.look_back().map(|t| t.kind); - if [ - // for grant - SyntaxKind::Grant, - SyntaxKind::Ascii44, - ] - .iter() - .all(|x| Some(x) != prev.as_ref()) - { - break; - } - - p.advance(); - } - Some(_) => { - break; - } - None => { - p.advance(); - } - }, - } - } -} diff --git a/crates/pgt_statement_splitter/src/parser/ddl.rs b/crates/pgt_statement_splitter/src/parser/ddl.rs deleted file mode 100644 index d9f233c2..00000000 --- a/crates/pgt_statement_splitter/src/parser/ddl.rs +++ /dev/null @@ -1,15 +0,0 @@ -use pgt_lexer::SyntaxKind; - -use super::{Parser, common::unknown}; - -pub(crate) fn create(p: &mut Parser) { - p.expect(SyntaxKind::Create); - - unknown(p, &[SyntaxKind::With]); -} - -pub(crate) fn alter(p: &mut Parser) { - p.expect(SyntaxKind::Alter); - - unknown(p, &[SyntaxKind::Alter]); -} diff --git a/crates/pgt_statement_splitter/src/parser/dml.rs b/crates/pgt_statement_splitter/src/parser/dml.rs deleted file mode 100644 index 015c50b6..00000000 --- a/crates/pgt_statement_splitter/src/parser/dml.rs +++ /dev/null @@ -1,59 +0,0 @@ -use pgt_lexer::SyntaxKind; - -use super::{ - Parser, - common::{parenthesis, unknown}, -}; - -pub(crate) fn cte(p: &mut Parser) { - p.expect(SyntaxKind::With); - - loop { - p.expect(SyntaxKind::Ident); - p.expect(SyntaxKind::As); - parenthesis(p); - - if p.current().kind == SyntaxKind::Ascii44 { - p.advance(); - } else { - break; - } - } - - unknown( - p, - &[ - SyntaxKind::Select, - SyntaxKind::Insert, - SyntaxKind::Update, - SyntaxKind::DeleteP, - SyntaxKind::Merge, - ], - ); -} - -pub(crate) fn select(p: &mut Parser) { - p.expect(SyntaxKind::Select); - - unknown(p, &[]); -} - -pub(crate) fn insert(p: &mut Parser) { - p.expect(SyntaxKind::Insert); - p.expect(SyntaxKind::Into); - - unknown(p, &[SyntaxKind::Select]); -} - -pub(crate) fn update(p: &mut Parser) { - p.expect(SyntaxKind::Update); - - unknown(p, &[]); -} - -pub(crate) fn delete(p: &mut Parser) { - p.expect(SyntaxKind::DeleteP); - p.expect(SyntaxKind::From); - - unknown(p, &[]); -} diff --git a/crates/pgt_statement_splitter/src/splitter.rs b/crates/pgt_statement_splitter/src/splitter.rs new file mode 100644 index 00000000..cfb4716d --- /dev/null +++ b/crates/pgt_statement_splitter/src/splitter.rs @@ -0,0 +1,168 @@ +mod common; +mod data; +mod ddl; +mod dml; + +pub use common::source; + +use pgt_lexer::{Lexed, SyntaxKind}; +use pgt_text_size::TextRange; + +pub struct SplitResult { + pub ranges: Vec, + pub errors: Vec, +} + +pub static TRIVIA_TOKENS: &[SyntaxKind] = &[ + SyntaxKind::SPACE, + SyntaxKind::TAB, + SyntaxKind::VERTICAL_TAB, + SyntaxKind::FORM_FEED, + SyntaxKind::COMMENT, + // LINE_ENDING is relevant +]; + +/// Internal 
error type used during splitting +#[derive(Debug, Clone)] +pub struct SplitError { + pub msg: String, + pub token: usize, +} + +pub struct Splitter<'a> { + lexed: &'a Lexed<'a>, + current_pos: usize, + stmt_ranges: Vec<(usize, usize)>, + errors: Vec, + current_stmt_start: Option, +} + +impl<'a> Splitter<'a> { + pub fn new(lexed: &'a Lexed<'a>) -> Self { + Self { + lexed, + current_pos: 0, + stmt_ranges: Vec::new(), + errors: Vec::new(), + current_stmt_start: None, + } + } + + pub fn finish(self) -> SplitResult { + let ranges = self + .stmt_ranges + .iter() + .map(|(start_token_pos, end_token_pos)| { + let from = self.lexed.range(*start_token_pos).start(); + let to = self.lexed.range(*end_token_pos).end(); + TextRange::new(from, to) + }) + .collect(); + + SplitResult { + ranges, + errors: self.errors, + } + } + + pub fn start_stmt(&mut self) { + assert!( + self.current_stmt_start.is_none(), + "cannot start statement within statement", + ); + self.current_stmt_start = Some(self.current_pos); + } + + pub fn close_stmt(&mut self) { + assert!( + self.current_stmt_start.is_some(), + "Must start statement before closing it." + ); + + let start_token_pos = self.current_stmt_start.unwrap(); + + assert!( + self.current_pos > start_token_pos, + "Must close the statement on a token that's later than the start token: {} > {}", + self.current_pos, + start_token_pos + ); + + let end_token_pos = (0..self.current_pos) + .rev() + .find(|&idx| !self.is_trivia(idx)) + .unwrap(); + + self.stmt_ranges.push((start_token_pos, end_token_pos)); + + self.current_stmt_start = None; + } + + fn current(&self) -> SyntaxKind { + self.lexed.kind(self.current_pos) + } + + fn kind(&self, idx: usize) -> SyntaxKind { + self.lexed.kind(idx) + } + + /// Advances the parser to the next relevant token and returns it. + /// + /// NOTE: This will skip trivia tokens. + fn advance(&mut self) -> SyntaxKind { + let pos = (self.current_pos + 1..self.lexed.len()) + .find(|&idx| !self.is_trivia(idx)) + .expect("lexed should have non-trivia eof token"); + + self.current_pos = pos; + self.lexed.kind(pos) + } + + fn look_ahead(&self, ignore_trivia: bool) -> SyntaxKind { + let pos = if ignore_trivia { + (self.current_pos + 1..self.lexed.len()) + .find(|&idx| !self.is_trivia(idx)) + .expect("lexed should have non-trivia eof token") + } else { + (self.current_pos + 1..self.lexed.len()) + .next() + .expect("lexed should have a eof token") + }; + self.lexed.kind(pos) + } + + /// Returns `None` if there are no previous relevant tokens + fn look_back(&self, ignore_trivia: bool) -> Option { + if ignore_trivia { + (0..self.current_pos) + .rev() + .find(|&idx| !self.is_trivia(idx)) + .map(|idx| self.lexed.kind(idx)) + } else { + (0..self.current_pos) + .next_back() + .map(|idx| self.lexed.kind(idx)) + } + } + + fn is_trivia(&self, idx: usize) -> bool { + match self.lexed.kind(idx) { + k if TRIVIA_TOKENS.contains(&k) => true, + SyntaxKind::LINE_ENDING => self.lexed.line_ending_count(idx) < 2, + _ => false, + } + } + + /// Will advance if the `kind` matches the current token. + /// Otherwise, will add a diagnostic to the internal `errors`. 
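The `Splitter` above never materializes a filtered token list; it walks token indices and skips trivia on the fly in `advance`, `look_ahead`, and `look_back`. A reduced, std-only illustration of that index-skipping pattern (hypothetical `Kind` enum standing in for `SyntaxKind`):

#[derive(Clone, Copy, PartialEq, Debug)]
enum Kind {
    Space,
    Comment,
    Ident,
    Semicolon,
    Eof,
}

fn is_trivia(kind: Kind) -> bool {
    matches!(kind, Kind::Space | Kind::Comment)
}

struct Walker {
    kinds: Vec<Kind>,
    pos: usize,
}

impl Walker {
    /// Jump to the next non-trivia token; the stream is expected to end with a non-trivia Eof.
    fn advance(&mut self) -> Kind {
        self.pos = (self.pos + 1..self.kinds.len())
            .find(|&i| !is_trivia(self.kinds[i]))
            .expect("token stream should end with a non-trivia Eof");
        self.kinds[self.pos]
    }
}

fn main() {
    let mut w = Walker {
        kinds: vec![Kind::Ident, Kind::Space, Kind::Comment, Kind::Semicolon, Kind::Eof],
        pos: 0,
    };
    assert_eq!(w.advance(), Kind::Semicolon); // trivia in between is skipped
    assert_eq!(w.advance(), Kind::Eof);
}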
+ fn expect(&mut self, kind: SyntaxKind) { + if self.current() == kind { + self.advance(); + } else { + self.errors.push(SplitError { + msg: format!("Expected {:#?}", kind), + token: self.current_pos, + }); + } + } +} diff --git a/crates/pgt_statement_splitter/src/splitter/common.rs b/crates/pgt_statement_splitter/src/splitter/common.rs new file mode 100644 index 00000000..4f2cd069 --- /dev/null +++ b/crates/pgt_statement_splitter/src/splitter/common.rs @@ -0,0 +1,275 @@ +use super::TRIVIA_TOKENS; +use pgt_lexer::SyntaxKind; + +use super::{ + Splitter, + data::at_statement_start, + ddl::{alter, create}, + dml::{cte, delete, insert, select, update}, +}; + +pub fn source(p: &mut Splitter) { + loop { + match p.current() { + SyntaxKind::EOF => { + break; + } + kind if TRIVIA_TOKENS.contains(&kind) || kind == SyntaxKind::LINE_ENDING => { + p.advance(); + } + SyntaxKind::BACKSLASH => { + plpgsql_command(p); + } + _ => { + statement(p); + } + } + } +} + +pub(crate) fn statement(p: &mut Splitter) { + p.start_stmt(); + match p.current() { + SyntaxKind::WITH_KW => { + cte(p); + } + SyntaxKind::SELECT_KW => { + select(p); + } + SyntaxKind::INSERT_KW => { + insert(p); + } + SyntaxKind::UPDATE_KW => { + update(p); + } + SyntaxKind::DELETE_KW => { + delete(p); + } + SyntaxKind::CREATE_KW => { + create(p); + } + SyntaxKind::ALTER_KW => { + alter(p); + } + _ => { + unknown(p, &[]); + } + } + p.close_stmt(); +} + +pub(crate) fn parenthesis(p: &mut Splitter) { + p.expect(SyntaxKind::L_PAREN); + + let mut depth = 1; + + loop { + match p.current() { + SyntaxKind::L_PAREN => { + p.advance(); + depth += 1; + } + SyntaxKind::R_PAREN | SyntaxKind::EOF => { + p.advance(); + depth -= 1; + if depth == 0 { + break; + } + } + _ => { + p.advance(); + } + } + } +} + +pub(crate) fn plpgsql_command(p: &mut Splitter) { + p.expect(SyntaxKind::BACKSLASH); + + loop { + match p.current() { + SyntaxKind::LINE_ENDING => { + p.advance(); + break; + } + _ => { + // advance the splitter to the next token without ignoring irrelevant tokens + // we would skip a newline with `advance()` + p.current_pos += 1; + } + } + } +} + +pub(crate) fn case(p: &mut Splitter) { + p.expect(SyntaxKind::CASE_KW); + + loop { + match p.current() { + SyntaxKind::END_KW => { + p.advance(); + break; + } + _ => { + p.advance(); + } + } + } +} + +pub(crate) fn unknown(p: &mut Splitter, exclude: &[SyntaxKind]) { + loop { + match p.current() { + SyntaxKind::SEMICOLON => { + p.advance(); + break; + } + SyntaxKind::EOF => { + break; + } + SyntaxKind::LINE_ENDING => { + if p.look_back(true).is_some_and(|t| t == SyntaxKind::COMMA) { + p.advance(); + } else { + break; + } + } + SyntaxKind::CASE_KW => { + case(p); + } + SyntaxKind::BACKSLASH => { + // pgsql commands + // we want to check if the previous token non-trivia token is a LINE_ENDING + // we cannot use the is_trivia() method because that would exclude LINE_ENDINGs + // with count > 1 + if (0..p.current_pos) + .rev() + .find_map(|idx| { + let kind = p.kind(idx); + if !TRIVIA_TOKENS.contains(&kind) { + Some(kind) + } else { + None + } + }) + .is_some_and(|t| t == SyntaxKind::LINE_ENDING) + { + break; + } + p.advance(); + } + SyntaxKind::L_PAREN => { + parenthesis(p); + } + t => match at_statement_start(t, exclude) { + Some(SyntaxKind::SELECT_KW) => { + let prev = p.look_back(true); + if [ + // for policies, with for select + SyntaxKind::FOR_KW, + // for create view / table as + SyntaxKind::AS_KW, + // for create rule + SyntaxKind::ON_KW, + // for create rule + SyntaxKind::ALSO_KW, + // for create rule + 
SyntaxKind::INSTEAD_KW, + // for UNION + SyntaxKind::UNION_KW, + // for UNION ALL + SyntaxKind::ALL_KW, + // for UNION ... EXCEPT + SyntaxKind::EXCEPT_KW, + // for grant + SyntaxKind::GRANT_KW, + SyntaxKind::COMMA, + ] + .iter() + .all(|x| Some(x) != prev.as_ref()) + { + break; + } + + p.advance(); + } + Some(SyntaxKind::INSERT_KW) + | Some(SyntaxKind::UPDATE_KW) + | Some(SyntaxKind::DELETE_KW) => { + let prev = p.look_back(true); + if [ + // for create trigger + SyntaxKind::BEFORE_KW, + SyntaxKind::AFTER_KW, + // for policies, e.g. for insert + SyntaxKind::FOR_KW, + // e.g. on insert or delete + SyntaxKind::OR_KW, + // e.g. INSTEAD OF INSERT + SyntaxKind::OF_KW, + // for create rule + SyntaxKind::ON_KW, + // for create rule + SyntaxKind::ALSO_KW, + // for create rule + SyntaxKind::INSTEAD_KW, + // for grant + SyntaxKind::GRANT_KW, + SyntaxKind::COMMA, + // Do update in INSERT stmt + SyntaxKind::DO_KW, + ] + .iter() + .all(|x| Some(x) != prev.as_ref()) + { + break; + } + p.advance(); + } + Some(SyntaxKind::WITH_KW) => { + let next = p.look_ahead(true); + if [ + // WITH ORDINALITY should not start a new statement + SyntaxKind::ORDINALITY_KW, + // WITH CHECK should not start a new statement + SyntaxKind::CHECK_KW, + // TIMESTAMP WITH TIME ZONE should not start a new statement + SyntaxKind::TIME_KW, + SyntaxKind::GRANT_KW, + SyntaxKind::ADMIN_KW, + SyntaxKind::INHERIT_KW, + SyntaxKind::SET_KW, + ] + .iter() + .all(|x| x != &next) + { + break; + } + p.advance(); + } + + Some(SyntaxKind::CREATE_KW) => { + let prev = p.look_back(true); + if [ + // for grant + SyntaxKind::GRANT_KW, + SyntaxKind::COMMA, + ] + .iter() + .all(|x| Some(x) != prev.as_ref()) + { + break; + } + + p.advance(); + } + Some(_) => { + break; + } + None => { + p.advance(); + } + }, + } + } +} diff --git a/crates/pgt_statement_splitter/src/parser/data.rs b/crates/pgt_statement_splitter/src/splitter/data.rs similarity index 62% rename from crates/pgt_statement_splitter/src/parser/data.rs rename to crates/pgt_statement_splitter/src/splitter/data.rs index c0792c39..0827484b 100644 --- a/crates/pgt_statement_splitter/src/parser/data.rs +++ b/crates/pgt_statement_splitter/src/splitter/data.rs @@ -3,15 +3,15 @@ use pgt_lexer::SyntaxKind; // All tokens listed here must be explicitly handled in the `unknown` function to ensure that we do // not break in the middle of another statement that contains a statement start token. 
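Each keyword list above only matters because `at_statement_start` checks a token against a fixed set of statement-start tokens, minus whatever the current statement asks to exclude (see the data.rs hunk that follows). A std-only sketch of that lookup with a hypothetical `Kind` enum:

#[derive(Clone, Copy, PartialEq, Debug)]
enum Kind {
    With,
    Select,
    Insert,
    Update,
    Delete,
    Create,
    Alter,
    Ident,
}

const STATEMENT_START: &[Kind] = &[
    Kind::With,
    Kind::Select,
    Kind::Insert,
    Kind::Update,
    Kind::Delete,
    Kind::Create,
    Kind::Alter,
];

fn at_statement_start(kind: Kind, exclude: &[Kind]) -> Option<Kind> {
    STATEMENT_START
        .iter()
        .copied()
        .filter(|k| !exclude.contains(k))
        .find(|&k| k == kind)
}

fn main() {
    // A CTE body excludes SELECT and friends, so they do not end the current statement.
    assert_eq!(at_statement_start(Kind::Select, &[Kind::Select]), None);
    assert_eq!(at_statement_start(Kind::Create, &[]), Some(Kind::Create));
    assert_eq!(at_statement_start(Kind::Ident, &[]), None);
}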
// -// All of these statements must have a dedicated parser function called from the `statement` function +// All of these statements must have a dedicated splitter function called from the `statement` function static STATEMENT_START_TOKENS: &[SyntaxKind] = &[ - SyntaxKind::With, - SyntaxKind::Select, - SyntaxKind::Insert, - SyntaxKind::Update, - SyntaxKind::DeleteP, - SyntaxKind::Create, - SyntaxKind::Alter, + SyntaxKind::WITH_KW, + SyntaxKind::SELECT_KW, + SyntaxKind::INSERT_KW, + SyntaxKind::UPDATE_KW, + SyntaxKind::DELETE_KW, + SyntaxKind::CREATE_KW, + SyntaxKind::ALTER_KW, ]; pub(crate) fn at_statement_start(kind: SyntaxKind, exclude: &[SyntaxKind]) -> Option<&SyntaxKind> { diff --git a/crates/pgt_statement_splitter/src/splitter/ddl.rs b/crates/pgt_statement_splitter/src/splitter/ddl.rs new file mode 100644 index 00000000..449288aa --- /dev/null +++ b/crates/pgt_statement_splitter/src/splitter/ddl.rs @@ -0,0 +1,15 @@ +use pgt_lexer::SyntaxKind; + +use super::{Splitter, common::unknown}; + +pub(crate) fn create(p: &mut Splitter) { + p.expect(SyntaxKind::CREATE_KW); + + unknown(p, &[SyntaxKind::WITH_KW]); +} + +pub(crate) fn alter(p: &mut Splitter) { + p.expect(SyntaxKind::ALTER_KW); + + unknown(p, &[SyntaxKind::ALTER_KW]); +} diff --git a/crates/pgt_statement_splitter/src/splitter/dml.rs b/crates/pgt_statement_splitter/src/splitter/dml.rs new file mode 100644 index 00000000..9c833301 --- /dev/null +++ b/crates/pgt_statement_splitter/src/splitter/dml.rs @@ -0,0 +1,59 @@ +use pgt_lexer::SyntaxKind; + +use super::{ + Splitter, + common::{parenthesis, unknown}, +}; + +pub(crate) fn cte(p: &mut Splitter) { + p.expect(SyntaxKind::WITH_KW); + + loop { + p.expect(SyntaxKind::IDENT); + p.expect(SyntaxKind::AS_KW); + parenthesis(p); + + if p.current() == SyntaxKind::COMMA { + p.advance(); + } else { + break; + } + } + + unknown( + p, + &[ + SyntaxKind::SELECT_KW, + SyntaxKind::INSERT_KW, + SyntaxKind::UPDATE_KW, + SyntaxKind::DELETE_KW, + SyntaxKind::MERGE_KW, + ], + ); +} + +pub(crate) fn select(p: &mut Splitter) { + p.expect(SyntaxKind::SELECT_KW); + + unknown(p, &[]); +} + +pub(crate) fn insert(p: &mut Splitter) { + p.expect(SyntaxKind::INSERT_KW); + p.expect(SyntaxKind::INTO_KW); + + unknown(p, &[SyntaxKind::SELECT_KW]); +} + +pub(crate) fn update(p: &mut Splitter) { + p.expect(SyntaxKind::UPDATE_KW); + + unknown(p, &[]); +} + +pub(crate) fn delete(p: &mut Splitter) { + p.expect(SyntaxKind::DELETE_KW); + p.expect(SyntaxKind::FROM_KW); + + unknown(p, &[]); +} diff --git a/crates/pgt_statement_splitter/tests/statement_splitter_tests.rs b/crates/pgt_statement_splitter/tests/statement_splitter_tests.rs index e0534725..a4cf3259 100644 --- a/crates/pgt_statement_splitter/tests/statement_splitter_tests.rs +++ b/crates/pgt_statement_splitter/tests/statement_splitter_tests.rs @@ -22,7 +22,7 @@ fn test_statement_splitter() { let contents = fs::read_to_string(&path).unwrap(); - let split = pgt_statement_splitter::split(&contents).expect("Failed to split"); + let split = pgt_statement_splitter::split(&contents); assert_eq!( split.ranges.len(), diff --git a/crates/pgt_query_ext_codegen/Cargo.toml b/crates/pgt_tokenizer/Cargo.toml similarity index 62% rename from crates/pgt_query_ext_codegen/Cargo.toml rename to crates/pgt_tokenizer/Cargo.toml index c3a0f20d..9cd4bf5e 100644 --- a/crates/pgt_query_ext_codegen/Cargo.toml +++ b/crates/pgt_tokenizer/Cargo.toml @@ -6,17 +6,14 @@ edition.workspace = true homepage.workspace = true keywords.workspace = true license.workspace = true -name = 
"pgt_query_ext_codegen" +name = "pgt_tokenizer" repository.workspace = true version = "0.0.0" [dependencies] -proc-macro2.workspace = true -quote.workspace = true -pgt_query_proto_parser.workspace = true +[dev-dependencies] +insta.workspace = true [lib] -doctest = false -proc-macro = true diff --git a/crates/pgt_tokenizer/README.md b/crates/pgt_tokenizer/README.md new file mode 100644 index 00000000..8fc21d34 --- /dev/null +++ b/crates/pgt_tokenizer/README.md @@ -0,0 +1 @@ +Heavily inspired by and copied from [squawk_lexer](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_lexer). Thanks for making all the hard work MIT-licensed! diff --git a/crates/pgt_tokenizer/src/cursor.rs b/crates/pgt_tokenizer/src/cursor.rs new file mode 100644 index 00000000..64710f29 --- /dev/null +++ b/crates/pgt_tokenizer/src/cursor.rs @@ -0,0 +1,73 @@ +use std::str::Chars; + +/// Peekable iterator over a char sequence. +/// +/// Next characters can be peeked via `first` method, +/// and position can be shifted forward via `bump` method. +/// based on: +/// - +/// - +/// +pub(crate) struct Cursor<'a> { + /// Iterator over chars. Slightly faster than a &str. + chars: Chars<'a>, + len_remaining: usize, +} + +pub(crate) const EOF_CHAR: char = '\0'; + +impl<'a> Cursor<'a> { + pub(crate) fn new(input: &'a str) -> Cursor<'a> { + Cursor { + len_remaining: input.len(), + chars: input.chars(), + } + } + + /// Peeks the next symbol from the input stream without consuming it. + /// If requested position doesn't exist, `EOF_CHAR` is returned. + /// However, getting `EOF_CHAR` doesn't always mean actual end of file, + /// it should be checked with `is_eof` method. + pub(crate) fn first(&self) -> char { + // `.next()` optimizes better than `.nth(0)` + self.chars.clone().next().unwrap_or(EOF_CHAR) + } + + /// Peeks the second next symbol from the input stream without consuming it. + /// If requested position doesn't exist, `EOF_CHAR` is returned. + /// However, getting `EOF_CHAR` doesn't always mean actual end of file, + /// it should be checked with `is_eof` method. + pub(crate) fn second(&self) -> char { + self.chars.clone().nth(1).unwrap_or(EOF_CHAR) + } + + /// Checks if there is nothing more to consume. + pub(crate) fn is_eof(&self) -> bool { + self.chars.as_str().is_empty() + } + + /// Returns amount of already consumed symbols. + pub(crate) fn pos_within_token(&self) -> u32 { + (self.len_remaining - self.chars.as_str().len()) as u32 + } + + /// Resets the number of bytes consumed to 0. + pub(crate) fn reset_pos_within_token(&mut self) { + self.len_remaining = self.chars.as_str().len(); + } + + /// Moves to the next character. + pub(crate) fn bump(&mut self) -> Option { + let c = self.chars.next()?; + Some(c) + } + + /// Eats symbols while predicate returns true or until the end of file is reached. + pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // It was tried making optimized version of this for eg. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. 
+ while predicate(self.first()) && !self.is_eof() { + self.bump(); + } + } +} diff --git a/crates/pgt_tokenizer/src/lib.rs b/crates/pgt_tokenizer/src/lib.rs new file mode 100644 index 00000000..787adcaa --- /dev/null +++ b/crates/pgt_tokenizer/src/lib.rs @@ -0,0 +1,830 @@ +mod cursor; +mod token; +use cursor::{Cursor, EOF_CHAR}; +pub use token::{Base, LiteralKind, Token, TokenKind}; + +// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346 +// ident_start [A-Za-z\200-\377_] +const fn is_ident_start(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..='\u{FF}') +} + +// ident_cont [A-Za-z\200-\377_0-9\$] +const fn is_ident_cont(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..='\u{FF}') +} + +// whitespace +// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128 +// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229 + +const fn is_space(c: char) -> bool { + matches!( + c, ' ' // space + ) +} + +const fn is_tab(c: char) -> bool { + matches!( + c, '\t' // tab + ) +} + +const fn is_line_ending(c: char) -> bool { + matches!( + c, + '\n' | '\r' // newline or carriage return + ) +} + +const fn is_vertical_tab(c: char) -> bool { + matches!( + c, '\u{000B}' // vertical tab + ) +} + +const fn is_form_feed(c: char) -> bool { + matches!( + c, '\u{000C}' // form feed + ) +} + +impl Cursor<'_> { + // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339 + pub(crate) fn advance_token(&mut self) -> Token { + let Some(first_char) = self.bump() else { + return Token::new(TokenKind::Eof, 0); + }; + let token_kind = match first_char { + // Slash, comment or block comment. + '/' => match self.first() { + '*' => self.block_comment(), + _ => TokenKind::Slash, + }, + '-' => match self.first() { + '-' => self.line_comment(), + _ => TokenKind::Minus, + }, + + c if is_space(c) => { + self.eat_while(is_space); + TokenKind::Space + } + + c if is_tab(c) => { + self.eat_while(is_tab); + TokenKind::Tab + } + + c if is_line_ending(c) => self.line_ending_sequence(c), + + c if is_vertical_tab(c) => { + self.eat_while(is_vertical_tab); + TokenKind::VerticalTab + } + + c if is_form_feed(c) => { + self.eat_while(is_form_feed); + TokenKind::FormFeed + } + + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + 'u' | 'U' => match self.first() { + '&' => { + self.bump(); + self.prefixed_string( + |terminated| LiteralKind::UnicodeEscStr { terminated }, + true, + ) + } + _ => self.ident_or_unknown_prefix(), + }, + + // escaped strings + 'e' | 'E' => { + self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false) + } + + // bit string + 'b' | 'B' => { + self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false) + } + + // hexadecimal byte string + 'x' | 'X' => { + self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false) + } + + // Identifier (this should be checked after other variant that can + // start as identifier). + c if is_ident_start(c) => self.ident(), + + // Numeric literal. + // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC + c @ '0'..='9' => { + let literal_kind = self.number(c); + TokenKind::Literal { kind: literal_kind } + } + '.' 
=> match self.first() { + '0'..='9' => { + let literal_kind = self.number('.'); + TokenKind::Literal { kind: literal_kind } + } + _ => TokenKind::Dot, + }, + // One-symbol tokens. + ';' => TokenKind::Semi, + '\\' => TokenKind::Backslash, + ',' => TokenKind::Comma, + '(' => TokenKind::OpenParen, + ')' => TokenKind::CloseParen, + '[' => TokenKind::OpenBracket, + ']' => TokenKind::CloseBracket, + '@' => TokenKind::At, + '#' => TokenKind::Pound, + '~' => TokenKind::Tilde, + '?' => TokenKind::Question, + ':' => TokenKind::Colon, + '$' => { + // Dollar quoted strings + if is_ident_start(self.first()) || self.first() == '$' { + self.dollar_quoted_string() + } else { + // Parameters + while self.first().is_ascii_digit() { + self.bump(); + } + TokenKind::PositionalParam + } + } + '`' => TokenKind::Backtick, + '=' => TokenKind::Eq, + '!' => TokenKind::Bang, + '<' => TokenKind::Lt, + '>' => TokenKind::Gt, + '&' => TokenKind::And, + '|' => TokenKind::Or, + '+' => TokenKind::Plus, + '*' => TokenKind::Star, + '^' => TokenKind::Caret, + '%' => TokenKind::Percent, + + // String literal + '\'' => { + let terminated = self.single_quoted_string(); + let kind = LiteralKind::Str { terminated }; + TokenKind::Literal { kind } + } + + // Quoted indentifiers + '"' => { + let terminated = self.double_quoted_string(); + TokenKind::QuotedIdent { terminated } + } + _ => TokenKind::Unknown, + }; + let res = Token::new(token_kind, self.pos_within_token()); + self.reset_pos_within_token(); + res + } + pub(crate) fn ident(&mut self) -> TokenKind { + self.eat_while(is_ident_cont); + TokenKind::Ident + } + + fn ident_or_unknown_prefix(&mut self) -> TokenKind { + // Start is already eaten, eat the rest of identifier. + self.eat_while(is_ident_cont); + // Known prefixes must have been handled earlier. So if + // we see a prefix here, it is definitely an unknown prefix. + match self.first() { + '#' | '"' | '\'' => TokenKind::UnknownPrefix, + _ => TokenKind::Ident, + } + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227 + // comment ("--"{non_newline}*) + pub(crate) fn line_comment(&mut self) -> TokenKind { + self.bump(); + + self.eat_while(|c| c != '\n'); + TokenKind::LineComment + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344 + pub(crate) fn block_comment(&mut self) -> TokenKind { + self.bump(); + + let mut depth = 1usize; + while let Some(c) = self.bump() { + match c { + '/' if self.first() == '*' => { + self.bump(); + depth += 1; + } + '*' if self.first() == '/' => { + self.bump(); + depth -= 1; + if depth == 0 { + // This block comment is closed, so for a construction like "/* */ */" + // there will be a successfully parsed block comment "/* */" + // and " */" will be processed separately. + break; + } + } + _ => (), + } + } + + TokenKind::BlockComment { + terminated: depth == 0, + } + } + + // invariant: we care about the number of consecutive newlines so we count them. 
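The `LineEnding { count }` token folds each `\r\n` pair into a single logical break, so downstream code (the statement splitter's trivia check) can simply ask whether a gap contains a blank line, i.e. `count >= 2`. A small std-only check of that counting rule (assumes the input slice holds only '\r' and '\n'):

/// Count logical line breaks, treating "\r\n" as a single break.
fn count_line_breaks(run: &str) -> usize {
    let mut breaks = 0;
    let mut chars = run.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\r' && chars.peek() == Some(&'\n') {
            chars.next(); // fold the '\n' of a "\r\n" pair into the same break
        }
        breaks += 1;
    }
    breaks
}

fn main() {
    assert_eq!(count_line_breaks("\n"), 1);
    assert_eq!(count_line_breaks("\r\n"), 1); // DOS ending is one logical break
    assert_eq!(count_line_breaks("\r\n\r\n"), 2); // blank line: a statement boundary candidate
    assert_eq!(count_line_breaks("\n\r\n\n"), 3);
}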
+ // + // Postgres considers a DOS-style \r\n sequence as two successive newlines, but we care about + // logical line breaks and consider \r\n as one logical line break + fn line_ending_sequence(&mut self, prev: char) -> TokenKind { + // already consumed first line ending character (\n or \r) + let mut line_breaks = 1; + + // started with \r, check if it's part of \r\n + if prev == '\r' && self.first() == '\n' { + // consume the \n - \r\n still counts as 1 logical line break + self.bump(); + } + + // continue checking for more line endings + loop { + match self.first() { + '\r' if self.second() == '\n' => { + self.bump(); // consume \r + self.bump(); // consume \n + line_breaks += 1; + } + '\n' => { + self.bump(); + line_breaks += 1; + } + '\r' => { + self.bump(); + line_breaks += 1; + } + _ => break, + } + } + + TokenKind::LineEnding { count: line_breaks } + } + + fn prefixed_string( + &mut self, + mk_kind: fn(bool) -> LiteralKind, + allows_double: bool, + ) -> TokenKind { + match self.first() { + '\'' => { + self.bump(); + let terminated = self.single_quoted_string(); + let kind = mk_kind(terminated); + TokenKind::Literal { kind } + } + '"' if allows_double => { + self.bump(); + let terminated = self.double_quoted_string(); + TokenKind::QuotedIdent { terminated } + } + _ => self.ident_or_unknown_prefix(), + } + } + + fn number(&mut self, first_digit: char) -> LiteralKind { + let mut base = Base::Decimal; + if first_digit == '0' { + // Attempt to parse encoding base. + match self.first() { + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403 + 'b' | 'B' => { + base = Base::Binary; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402 + 'o' | 'O' => { + base = Base::Octal; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401 + 'x' | 'X' => { + base = Base::Hexadecimal; + self.bump(); + if !self.eat_hexadecimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // Not a base prefix; consume additional digits. + '0'..='9' | '_' => { + self.eat_decimal_digits(); + } + + // Also not a base prefix; nothing more to do here. + '.' | 'e' | 'E' => {} + + // Just a 0. + _ => { + return LiteralKind::Int { + base, + empty_int: false, + }; + } + } + } else { + // No base prefix, parse number in the usual way. + self.eat_decimal_digits(); + }; + + match self.first() { + '.' 
=> { + // might have stuff after the ., and if it does, it needs to start + // with a number + self.bump(); + let mut empty_exponent = false; + if self.first().is_ascii_digit() { + self.eat_decimal_digits(); + match self.first() { + 'e' | 'E' => { + self.bump(); + empty_exponent = !self.eat_float_exponent(); + } + _ => (), + } + } else { + match self.first() { + 'e' | 'E' => { + self.bump(); + empty_exponent = !self.eat_float_exponent(); + } + _ => (), + } + } + LiteralKind::Float { + base, + empty_exponent, + } + } + 'e' | 'E' => { + self.bump(); + let empty_exponent = !self.eat_float_exponent(); + LiteralKind::Float { + base, + empty_exponent, + } + } + _ => LiteralKind::Int { + base, + empty_int: false, + }, + } + } + + fn single_quoted_string(&mut self) -> bool { + // Parse until either quotes are terminated or error is detected. + loop { + match self.first() { + // Quotes might be terminated. + '\'' => { + self.bump(); + + match self.first() { + // encountered an escaped quote '' + '\'' => { + self.bump(); + } + // encountered terminating quote + _ => return true, + } + } + // End of file, stop parsing. + EOF_CHAR if self.is_eof() => break, + // Skip the character. + _ => { + self.bump(); + } + } + } + // String was not terminated. + false + } + + /// Eats double-quoted string and returns true + /// if string is terminated. + fn double_quoted_string(&mut self) -> bool { + while let Some(c) = self.bump() { + match c { + '"' if self.first() == '"' => { + // Bump again to skip escaped character. + self.bump(); + } + '"' => { + return true; + } + _ => (), + } + } + // End of file reached. + false + } + + // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING + fn dollar_quoted_string(&mut self) -> TokenKind { + // Get the start sequence of the dollar quote, i.e., 'foo' in + // $foo$hello$foo$ + let mut start = vec![]; + while let Some(c) = self.bump() { + match c { + '$' => { + self.bump(); + break; + } + _ => { + start.push(c); + } + } + } + + // we have a dollar quoted string deliminated with `$$` + if start.is_empty() { + loop { + self.eat_while(|c| c != '$'); + if self.is_eof() { + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: false }, + }; + } + // eat $ + self.bump(); + if self.first() == '$' { + self.bump(); + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: true }, + }; + } + } + } else { + loop { + self.eat_while(|c| c != start[0]); + if self.is_eof() { + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: false }, + }; + } + + // might be the start of our start/end sequence + let mut match_count = 0; + for start_char in &start { + if self.first() == *start_char { + self.bump(); + match_count += 1; + } else { + self.bump(); + break; + } + } + + // closing '$' + let terminated = match_count == start.len(); + if self.first() == '$' && terminated { + self.bump(); + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated }, + }; + } + } + } + } + + fn eat_decimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.first() { + '_' => { + self.bump(); + } + '0'..='9' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn eat_hexadecimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.first() { + '_' => { + self.bump(); + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits 
+    }
+
+    /// Eats the float exponent. Returns true if at least one digit was found,
+    /// and returns false otherwise.
+    fn eat_float_exponent(&mut self) -> bool {
+        if self.first() == '-' || self.first() == '+' {
+            self.bump();
+        }
+        self.eat_decimal_digits()
+    }
+}
+
+/// Creates an iterator that produces tokens from the input string.
+pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
+    let mut cursor = Cursor::new(input);
+    std::iter::from_fn(move || {
+        let token = cursor.advance_token();
+        if token.kind != TokenKind::Eof {
+            Some(token)
+        } else {
+            None
+        }
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fmt;
+
+    use super::*;
+    use insta::assert_debug_snapshot;
+
+    struct TokenDebug<'a> {
+        content: &'a str,
+        token: Token,
+    }
+    impl fmt::Debug for TokenDebug<'_> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            write!(f, "{:?} @ {:?}", self.content, self.token.kind)
+        }
+    }
+
+    impl<'a> TokenDebug<'a> {
+        fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
+            TokenDebug {
+                token,
+                content: &input[start as usize..(start + token.len) as usize],
+            }
+        }
+    }
+
+    fn lex(input: &str) -> Vec<TokenDebug<'_>> {
+        let mut tokens = vec![];
+        let mut start = 0;
+
+        for token in tokenize(input) {
+            let length = token.len;
+            tokens.push(TokenDebug::new(token, input, start));
+            start += length;
+        }
+        tokens
+    }
+    #[test]
+    fn lex_statement() {
+        let result = lex("select 1;");
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn block_comment() {
+        let result = lex(r#"
+/*
+ * foo
+ * bar
+*/"#);
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn block_comment_unterminated() {
+        let result = lex(r#"
+/*
+ * foo
+ * bar
+ /*
+*/"#);
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn line_comment() {
+        let result = lex(r#"
+-- foooooooooooo bar buzz
+"#);
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn line_comment_whitespace() {
+        assert_debug_snapshot!(lex(r#"
+select 'Hello' -- This is a comment
+' World';"#))
+    }
+
+    #[test]
+    fn dollar_quoting() {
+        assert_debug_snapshot!(lex(r#"
+$$Dianne's horse$$
+$SomeTag$Dianne's horse$SomeTag$
+
+-- with dollar inside and matching tags
+$foo$hello$world$bar$
+"#))
+    }
+
+    #[test]
+    fn dollar_strings_part2() {
+        assert_debug_snapshot!(lex(r#"
+DO $doblock$
+end
+$doblock$;"#))
+    }
+
+    #[test]
+    fn dollar_quote_mismatch_tags_simple() {
+        assert_debug_snapshot!(lex(r#"
+-- dollar quoting with mismatched tags
+$foo$hello world$bar$
+"#));
+    }
+
+    #[test]
+    fn dollar_quote_mismatch_tags_complex() {
+        assert_debug_snapshot!(lex(r#"
+-- with dollar inside but mismatched tags
+$foo$hello$world$bar$
+"#));
+    }
+
+    #[test]
+    fn numeric() {
+        assert_debug_snapshot!(lex(r#"
+42
+3.5
+4.
+.001 +.123e10 +5e2 +1.925e-3 +1e-10 +1e+10 +1e10 +4664.E+5 +"#)) + } + + #[test] + fn numeric_non_decimal() { + assert_debug_snapshot!(lex(r#" +0b100101 +0B10011001 +0o273 +0O755 +0x42f +0XFFFF +"#)) + } + + #[test] + fn numeric_with_seperators() { + assert_debug_snapshot!(lex(r#" +1_500_000_000 +0b10001000_00000000 +0o_1_755 +0xFFFF_FFFF +1.618_034 +"#)) + } + + #[test] + fn select_with_period() { + assert_debug_snapshot!(lex(r#" +select public.users; +"#)) + } + + #[test] + fn bitstring() { + assert_debug_snapshot!(lex(r#" +B'1001' +b'1001' +X'1FF' +x'1FF' +"#)) + } + + #[test] + fn string() { + assert_debug_snapshot!(lex(r#" +'Dianne''s horse' + +select 'foo '' +bar'; + +select 'foooo' + 'bar'; + + +'foo \\ \n \tbar' + +'forgot to close the string +"#)) + } + + #[test] + fn params() { + assert_debug_snapshot!(lex(r#" +select $1 + $2; + +select $1123123123123; + +select $; +"#)) + } + + #[test] + fn string_with_escapes() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE + + assert_debug_snapshot!(lex(r#" +E'foo' + +e'bar' + +e'\b\f\n\r\t' + +e'\0\11\777' + +e'\x0\x11\xFF' + +e'\uAAAA \UFFFFFFFF' + +"#)) + } + + #[test] + fn string_unicode_escape() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + + assert_debug_snapshot!(lex(r#" +U&"d\0061t\+000061" + +U&"\0441\043B\043E\043D" + +u&'\0441\043B' + +U&"d!0061t!+000061" UESCAPE '!' +"#)) + } + + #[test] + fn quoted_ident() { + assert_debug_snapshot!(lex(r#" +"hello &1 -world"; + + +"hello-world +"#)) + } + + #[test] + fn quoted_ident_with_escape_quote() { + assert_debug_snapshot!(lex(r#" +"foo "" bar" +"#)) + } +} diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__bitstring.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__bitstring.snap new file mode 100644 index 00000000..ff3eec09 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__bitstring.snap @@ -0,0 +1,16 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\nB'1001'\nb'1001'\nX'1FF'\nx'1FF'\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "B'1001'" @ Literal { kind: BitStr { terminated: true } }, + "\n" @ LineEnding { count: 1 }, + "b'1001'" @ Literal { kind: BitStr { terminated: true } }, + "\n" @ LineEnding { count: 1 }, + "X'1FF'" @ Literal { kind: ByteStr { terminated: true } }, + "\n" @ LineEnding { count: 1 }, + "x'1FF'" @ Literal { kind: ByteStr { terminated: true } }, + "\n" @ LineEnding { count: 1 }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__block_comment.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__block_comment.snap new file mode 100644 index 00000000..22961ecf --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__block_comment.snap @@ -0,0 +1,9 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: result +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "/*\n * foo\n * bar\n*/" @ BlockComment { terminated: true }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__block_comment_unterminated.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__block_comment_unterminated.snap new file mode 100644 index 00000000..4dd6957e --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__block_comment_unterminated.snap @@ -0,0 +1,9 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: result +snapshot_kind: text +--- +[ + "\n" @ 
LineEnding { count: 1 }, + "/*\n * foo\n * bar\n /*\n*/" @ BlockComment { terminated: false }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_quote_mismatch_tags_complex.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_quote_mismatch_tags_complex.snap new file mode 100644 index 00000000..7f6a6649 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_quote_mismatch_tags_complex.snap @@ -0,0 +1,11 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\n-- with dollar inside but mismatched tags\n$foo$hello$world$bar$\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "-- with dollar inside but mismatched tags" @ LineComment, + "\n" @ LineEnding { count: 1 }, + "$foo$hello$world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_quote_mismatch_tags_simple.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_quote_mismatch_tags_simple.snap new file mode 100644 index 00000000..9d6d43a0 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_quote_mismatch_tags_simple.snap @@ -0,0 +1,11 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\n-- dollar quoting with mismatched tags\n$foo$hello world$bar$\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "-- dollar quoting with mismatched tags" @ LineComment, + "\n" @ LineEnding { count: 1 }, + "$foo$hello world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_quoting.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_quoting.snap new file mode 100644 index 00000000..ad1aa07d --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_quoting.snap @@ -0,0 +1,15 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\n$$Dianne's horse$$\n$SomeTag$Dianne's horse$SomeTag$\n\n-- with dollar inside and matching tags\n$foo$hello$world$bar$\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "$$Dianne's horse$$" @ Literal { kind: DollarQuotedString { terminated: true } }, + "\n" @ LineEnding { count: 1 }, + "$SomeTag$Dianne's horse$SomeTag$" @ Literal { kind: DollarQuotedString { terminated: true } }, + "\n\n" @ LineEnding { count: 2 }, + "-- with dollar inside and matching tags" @ LineComment, + "\n" @ LineEnding { count: 1 }, + "$foo$hello$world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_strings_part2.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_strings_part2.snap new file mode 100644 index 00000000..9aa49446 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__dollar_strings_part2.snap @@ -0,0 +1,12 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\nDO $doblock$\nend\n$doblock$;\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "DO" @ Ident, + " " @ Space, + "$doblock$\nend\n$doblock$" @ Literal { kind: DollarQuotedString { terminated: true } }, + ";" @ Semi, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__lex_statement.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__lex_statement.snap new file mode 100644 index 00000000..5679f2a7 --- 
/dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__lex_statement.snap @@ -0,0 +1,11 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: result +snapshot_kind: text +--- +[ + "select" @ Ident, + " " @ Space, + "1" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + ";" @ Semi, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__line_comment.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__line_comment.snap new file mode 100644 index 00000000..1cd8782a --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__line_comment.snap @@ -0,0 +1,10 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: result +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "-- foooooooooooo bar buzz" @ LineComment, + "\n" @ LineEnding { count: 1 }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__line_comment_whitespace.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__line_comment_whitespace.snap new file mode 100644 index 00000000..3cf5fb50 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__line_comment_whitespace.snap @@ -0,0 +1,16 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\nselect 'Hello' -- This is a comment\n' World';\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "select" @ Ident, + " " @ Space, + "'Hello'" @ Literal { kind: Str { terminated: true } }, + " " @ Space, + "-- This is a comment" @ LineComment, + "\n" @ LineEnding { count: 1 }, + "' World'" @ Literal { kind: Str { terminated: true } }, + ";" @ Semi, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__numeric.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__numeric.snap new file mode 100644 index 00000000..95fdb27a --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__numeric.snap @@ -0,0 +1,30 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\n42\n3.5\n4.\n.001\n.123e10\n5e2\n1.925e-3\n1e-10\n1e+10\n1e10\n4664.E+5\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "42" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "3.5" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, + "4." 
@ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, + ".001" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + ".123e10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, + "5e2" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, + "1.925e-3" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, + "1e-10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, + "1e+10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, + "1e10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, + "4664.E+5" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__numeric_non_decimal.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__numeric_non_decimal.snap new file mode 100644 index 00000000..e4430348 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__numeric_non_decimal.snap @@ -0,0 +1,20 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\n0b100101\n0B10011001\n0o273\n0O755\n0x42f\n0XFFFF\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "0b100101" @ Literal { kind: Int { base: Binary, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "0B10011001" @ Literal { kind: Int { base: Binary, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "0o273" @ Literal { kind: Int { base: Octal, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "0O755" @ Literal { kind: Int { base: Octal, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "0x42f" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "0XFFFF" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__numeric_with_seperators.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__numeric_with_seperators.snap new file mode 100644 index 00000000..cd0ecb21 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__numeric_with_seperators.snap @@ -0,0 +1,18 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\n1_500_000_000\n0b10001000_00000000\n0o_1_755\n0xFFFF_FFFF\n1.618_034\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "1_500_000_000" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "0b10001000_00000000" @ Literal { kind: Int { base: Binary, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "0o_1_755" @ Literal { kind: Int { base: Octal, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "0xFFFF_FFFF" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } }, + "\n" @ LineEnding { count: 1 }, + "1.618_034" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ LineEnding { count: 1 }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__params.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__params.snap new file mode 100644 index 00000000..6a436417 --- /dev/null +++ 
b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__params.snap @@ -0,0 +1,27 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\nselect $1 + $2;\n\nselect $1123123123123;\n\nselect $;\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "select" @ Ident, + " " @ Space, + "$1" @ PositionalParam, + " " @ Space, + "+" @ Plus, + " " @ Space, + "$2" @ PositionalParam, + ";" @ Semi, + "\n\n" @ LineEnding { count: 2 }, + "select" @ Ident, + " " @ Space, + "$1123123123123" @ PositionalParam, + ";" @ Semi, + "\n\n" @ LineEnding { count: 2 }, + "select" @ Ident, + " " @ Space, + "$" @ PositionalParam, + ";" @ Semi, + "\n" @ LineEnding { count: 1 }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__quoted_ident.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__quoted_ident.snap new file mode 100644 index 00000000..e1dffb06 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__quoted_ident.snap @@ -0,0 +1,12 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\n\"hello &1 -world\";\n\n\n\"hello-world\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "\"hello &1 -world\"" @ QuotedIdent { terminated: true }, + ";" @ Semi, + "\n\n\n" @ LineEnding { count: 3 }, + "\"hello-world\n" @ QuotedIdent { terminated: false }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__quoted_ident_with_escape_quote.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__quoted_ident_with_escape_quote.snap new file mode 100644 index 00000000..44ff06e5 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__quoted_ident_with_escape_quote.snap @@ -0,0 +1,10 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\n\"foo \"\" bar\"\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "\"foo \"\" bar\"" @ QuotedIdent { terminated: true }, + "\n" @ LineEnding { count: 1 }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__select_with_period.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__select_with_period.snap new file mode 100644 index 00000000..bc03da6a --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__select_with_period.snap @@ -0,0 +1,15 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\nselect public.users;\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "select" @ Ident, + " " @ Space, + "public" @ Ident, + "." 
@ Dot, + "users" @ Ident, + ";" @ Semi, + "\n" @ LineEnding { count: 1 }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__string.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__string.snap new file mode 100644 index 00000000..c7e5b8ba --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__string.snap @@ -0,0 +1,26 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\n'Dianne''s horse'\n\nselect 'foo ''\nbar';\n\nselect 'foooo'\n 'bar';\n\n\n'foo \\\\ \\n \\tbar'\n\n'forgot to close the string\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "'Dianne''s horse'" @ Literal { kind: Str { terminated: true } }, + "\n\n" @ LineEnding { count: 2 }, + "select" @ Ident, + " " @ Space, + "'foo ''\nbar'" @ Literal { kind: Str { terminated: true } }, + ";" @ Semi, + "\n\n" @ LineEnding { count: 2 }, + "select" @ Ident, + " " @ Space, + "'foooo'" @ Literal { kind: Str { terminated: true } }, + "\n" @ LineEnding { count: 1 }, + " " @ Space, + "'bar'" @ Literal { kind: Str { terminated: true } }, + ";" @ Semi, + "\n\n\n" @ LineEnding { count: 3 }, + "'foo \\\\ \\n \\tbar'" @ Literal { kind: Str { terminated: true } }, + "\n\n" @ LineEnding { count: 2 }, + "'forgot to close the string\n" @ Literal { kind: Str { terminated: false } }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__string_unicode_escape.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__string_unicode_escape.snap new file mode 100644 index 00000000..225a208a --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__string_unicode_escape.snap @@ -0,0 +1,20 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\nU&\"d\\0061t\\+000061\"\n\nU&\"\\0441\\043B\\043E\\043D\"\n\nu&'\\0441\\043B'\n\nU&\"d!0061t!+000061\" UESCAPE '!'\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "U&\"d\\0061t\\+000061\"" @ QuotedIdent { terminated: true }, + "\n\n" @ LineEnding { count: 2 }, + "U&\"\\0441\\043B\\043E\\043D\"" @ QuotedIdent { terminated: true }, + "\n\n" @ LineEnding { count: 2 }, + "u&'\\0441\\043B'" @ Literal { kind: UnicodeEscStr { terminated: true } }, + "\n\n" @ LineEnding { count: 2 }, + "U&\"d!0061t!+000061\"" @ QuotedIdent { terminated: true }, + " " @ Space, + "UESCAPE" @ Ident, + " " @ Space, + "'!'" @ Literal { kind: Str { terminated: true } }, + "\n" @ LineEnding { count: 1 }, +] diff --git a/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__string_with_escapes.snap b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__string_with_escapes.snap new file mode 100644 index 00000000..bbc94048 --- /dev/null +++ b/crates/pgt_tokenizer/src/snapshots/pgt_tokenizer__tests__string_with_escapes.snap @@ -0,0 +1,20 @@ +--- +source: crates/pgt_tokenizer/src/lib.rs +expression: "lex(r#\"\nE'foo'\n\ne'bar'\n\ne'\\b\\f\\n\\r\\t'\n\ne'\\0\\11\\777'\n\ne'\\x0\\x11\\xFF'\n\ne'\\uAAAA \\UFFFFFFFF'\n\n\"#)" +snapshot_kind: text +--- +[ + "\n" @ LineEnding { count: 1 }, + "E'foo'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ LineEnding { count: 2 }, + "e'bar'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ LineEnding { count: 2 }, + "e'\\b\\f\\n\\r\\t'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ LineEnding { count: 2 }, + "e'\\0\\11\\777'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ LineEnding { count: 2 }, + "e'\\x0\\x11\\xFF'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ 
LineEnding { count: 2 }, + "e'\\uAAAA \\UFFFFFFFF'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ LineEnding { count: 2 }, +] diff --git a/crates/pgt_tokenizer/src/token.rs b/crates/pgt_tokenizer/src/token.rs new file mode 100644 index 00000000..50a7d12a --- /dev/null +++ b/crates/pgt_tokenizer/src/token.rs @@ -0,0 +1,170 @@ +// based on: https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/lib.rs#L58 +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum TokenKind { + /// Used when there's an error of some sort while lexing. + Unknown, + /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid + /// suffix, but may be present here on string and float literals. Users of + /// this type will need to check for and reject that case. + /// + /// See [`LiteralKind`] for more details. + Literal { + kind: LiteralKind, + }, + /// Whitespace characters. + Space, + Tab, + VerticalTab, + FormFeed, + // Handles \n, \r, and sequences + LineEnding { + count: usize, + }, + /// Identifier + /// + /// case-sensitive + Ident, + /// `;` + Semi, + /// End of file + Eof, + /// `/` + Slash, + /// `\` + Backslash, + /// `-- foo` + LineComment, + /// ``` + /// /* + /// foo + /// */ + /// ``` + BlockComment { + terminated: bool, + }, + /// `-` + Minus, + /// `:` + Colon, + /// `.` + Dot, + /// `=` + Eq, + /// `>` + Gt, + /// `&` + And, + /// `<` + Lt, + /// `!` + Bang, + /// `+` + Plus, + /// `~` + Tilde, + /// `#` + Pound, + /// `?` + Question, + /// `|` + Or, + /// `%` + Percent, + /// `^` + Caret, + /// `*` + Star, + /// `` ` `` + Backtick, + /// `@` + At, + /// `]` + CloseBracket, + /// `[` + OpenBracket, + /// `)` + CloseParen, + /// `(` + OpenParen, + /// `,` + Comma, + /// Error case that we need to report later on. + UnknownPrefix, + /// Positional Parameter, e.g., `$1` + /// + /// see: + PositionalParam, + /// Quoted Identifier, e.g., `"update"` in `update "my_table" set "a" = 5;` + /// + /// These are case-sensitive, unlike [`TokenKind::Ident`] + /// + /// see: + QuotedIdent { + terminated: bool, + }, +} + +/// Parsed token. +/// It doesn't contain information about data that has been parsed, +/// only the type of the token and its size. +#[derive(Debug, Clone, Copy)] +pub struct Token { + pub kind: TokenKind, + pub len: u32, +} + +impl Token { + pub(crate) fn new(kind: TokenKind, len: u32) -> Token { + Token { kind, len } + } +} + +/// Base of numeric literal encoding according to its prefix. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Base { + /// Literal starts with "0b". + Binary = 2, + /// Literal starts with "0o". + Octal = 8, + /// Literal doesn't contain a prefix. + Decimal = 10, + /// Literal starts with "0x". + Hexadecimal = 16, +} + +// Enum representing the literal types supported by the lexer. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LiteralKind {
+    /// Integer Numeric, e.g., `42`
+    ///
+    /// see:
+    Int { base: Base, empty_int: bool },
+    /// Float Numeric, e.g., `1.925e-3`
+    ///
+    /// see:
+    Float { base: Base, empty_exponent: bool },
+    /// String, e.g., `'foo'`
+    ///
+    /// see:
+    Str { terminated: bool },
+    /// Hexadecimal Bit String, e.g., `X'1FF'`
+    ///
+    /// see:
+    ByteStr { terminated: bool },
+    /// Bit String, e.g., `B'1001'`
+    ///
+    /// see:
+    BitStr { terminated: bool },
+    /// Dollar Quoted String, e.g., `$$Dianne's horse$$`
+    ///
+    /// see:
+    DollarQuotedString { terminated: bool },
+    /// Unicode Escape String, e.g., `U&'d\0061t\+000061'`
+    ///
+    /// see:
+    UnicodeEscStr { terminated: bool },
+    /// Escape String, e.g., `E'foo'`
+    ///
+    /// see:
+    EscStr { terminated: bool },
+}
diff --git a/crates/pgt_workspace/src/workspace/server/annotation.rs b/crates/pgt_workspace/src/workspace/server/annotation.rs
index 2fdf32eb..db6a8b3b 100644
--- a/crates/pgt_workspace/src/workspace/server/annotation.rs
+++ b/crates/pgt_workspace/src/workspace/server/annotation.rs
@@ -1,7 +1,7 @@
 use std::sync::Arc;
 
 use dashmap::DashMap;
-use pgt_lexer::{SyntaxKind, WHITESPACE_TOKENS};
+use pgt_lexer::SyntaxKind;
 
 use super::statement_identifier::StatementId;
 
@@ -11,9 +11,18 @@ pub struct StatementAnnotations {
 }
 
 pub struct AnnotationStore {
-    db: DashMap<StatementId, Option<Arc<StatementAnnotations>>>,
+    db: DashMap<StatementId, Arc<StatementAnnotations>>,
 }
 
+const WHITESPACE_TOKENS: [SyntaxKind; 6] = [
+    SyntaxKind::SPACE,
+    SyntaxKind::TAB,
+    SyntaxKind::VERTICAL_TAB,
+    SyntaxKind::FORM_FEED,
+    SyntaxKind::LINE_ENDING,
+    SyntaxKind::EOF,
+];
+
 impl AnnotationStore {
     pub fn new() -> AnnotationStore {
         AnnotationStore { db: DashMap::new() }
@@ -24,26 +33,26 @@ impl AnnotationStore {
         &self,
         statement_id: &StatementId,
         content: &str,
-    ) -> Option<Arc<StatementAnnotations>> {
+    ) -> Arc<StatementAnnotations> {
         if let Some(existing) = self.db.get(statement_id).map(|x| x.clone()) {
             return existing;
         }
 
-        // we swallow the error here because the lexing within the document would have already
- let annotations = pgt_lexer::lex(content).ok().map(|tokens| { - let ends_with_semicolon = tokens - .iter() - .rev() - .find(|token| !WHITESPACE_TOKENS.contains(&token.kind)) - .is_some_and(|token| token.kind == SyntaxKind::Ascii59); - - Arc::new(StatementAnnotations { - ends_with_semicolon, - }) + let lexed = pgt_lexer::lex(content); + + let ends_with_semicolon = (0..lexed.len()) + // Iterate through tokens in reverse to find the last non-whitespace token + .filter(|t| !WHITESPACE_TOKENS.contains(&lexed.kind(*t))) + .next_back() + .map(|t| lexed.kind(t) == SyntaxKind::SEMICOLON) + .unwrap_or(false); + + let annotations = Arc::new(StatementAnnotations { + ends_with_semicolon, }); - self.db.insert(statement_id.clone(), None); + self.db.insert(statement_id.clone(), annotations.clone()); + annotations } @@ -80,8 +89,7 @@ mod tests { let annotations = store.get_annotations(&statement_id, content); - assert!(annotations.is_some()); - assert_eq!(annotations.unwrap().ends_with_semicolon, *expected); + assert_eq!(annotations.ends_with_semicolon, *expected); } } } diff --git a/crates/pgt_workspace/src/workspace/server/change.rs b/crates/pgt_workspace/src/workspace/server/change.rs index 62e3da03..cc455134 100644 --- a/crates/pgt_workspace/src/workspace/server/change.rs +++ b/crates/pgt_workspace/src/workspace/server/change.rs @@ -445,9 +445,7 @@ fn get_affected(content: &str, range: TextRange) -> &str { #[cfg(test)] mod tests { - use super::*; - use pgt_diagnostics::Diagnostic; use pgt_text_size::TextRange; use crate::workspace::{ChangeFileParams, ChangeParams}; @@ -462,9 +460,7 @@ mod tests { } fn assert_document_integrity(d: &Document) { - let ranges = pgt_statement_splitter::split(&d.content) - .expect("Unexpected scan error") - .ranges; + let ranges = pgt_statement_splitter::split(&d.content).ranges; assert!( ranges.len() == d.positions.len(), @@ -479,16 +475,6 @@ mod tests { ); } - #[test] - fn open_doc_with_scan_error() { - let input = "select id from users;\n\n\n\nselect 1443ddwwd33djwdkjw13331333333333;"; - - let d = Document::new(input.to_string(), 0); - - assert_eq!(d.positions.len(), 0); - assert!(d.has_fatal_error()); - } - #[test] fn comments_at_begin() { let path = PgTPath::new("test.sql"); @@ -621,149 +607,6 @@ mod tests { assert_document_integrity(&d); } - #[test] - fn change_into_scan_error_within_statement() { - let path = PgTPath::new("test.sql"); - let input = "select id from users;\n\n\n\nselect 1;"; - - let mut d = Document::new(input.to_string(), 0); - - assert_eq!(d.positions.len(), 2); - assert!(!d.has_fatal_error()); - - let change = ChangeFileParams { - path: path.clone(), - version: 1, - changes: vec![ChangeParams { - text: "d".to_string(), - range: Some(TextRange::new(33.into(), 33.into())), - }], - }; - - let changed = d.apply_file_change(&change); - - assert_eq!(d.content, "select id from users;\n\n\n\nselect 1d;"); - assert!( - changed - .iter() - .all(|c| matches!(c, StatementChange::Deleted(_))), - "should delete all statements" - ); - assert!(d.positions.is_empty(), "should clear all positions"); - assert_eq!(d.diagnostics.len(), 1, "should return a scan error"); - assert_eq!( - d.diagnostics[0].location().span, - Some(TextRange::new(32.into(), 34.into())), - "should have correct span" - ); - assert!(d.has_fatal_error()); - } - - #[test] - fn change_into_scan_error_across_statements() { - let path = PgTPath::new("test.sql"); - let input = "select id from users;\n\n\n\nselect 1;"; - - let mut d = Document::new(input.to_string(), 0); - - 
assert_eq!(d.positions.len(), 2); - assert!(!d.has_fatal_error()); - - let change = ChangeFileParams { - path: path.clone(), - version: 1, - changes: vec![ChangeParams { - text: "1d".to_string(), - range: Some(TextRange::new(7.into(), 33.into())), - }], - }; - - let changed = d.apply_file_change(&change); - - assert_eq!(d.content, "select 1d;"); - assert!( - changed - .iter() - .all(|c| matches!(c, StatementChange::Deleted(_))), - "should delete all statements" - ); - assert!(d.positions.is_empty(), "should clear all positions"); - assert_eq!(d.diagnostics.len(), 1, "should return a scan error"); - assert_eq!( - d.diagnostics[0].location().span, - Some(TextRange::new(7.into(), 9.into())), - "should have correct span" - ); - assert!(d.has_fatal_error()); - } - - #[test] - fn change_from_invalid_to_invalid() { - let path = PgTPath::new("test.sql"); - let input = "select 1d;"; - - let mut d = Document::new(input.to_string(), 0); - - assert_eq!(d.positions.len(), 0); - assert!(d.has_fatal_error()); - assert_eq!(d.diagnostics.len(), 1); - - let change = ChangeFileParams { - path: path.clone(), - version: 1, - changes: vec![ChangeParams { - text: "2e".to_string(), - range: Some(TextRange::new(7.into(), 9.into())), - }], - }; - - let changed = d.apply_file_change(&change); - - assert_eq!(d.content, "select 2e;"); - assert!(changed.is_empty(), "should not emit any changes"); - assert!(d.positions.is_empty(), "should keep positions empty"); - assert_eq!(d.diagnostics.len(), 1, "should still have a scan error"); - assert_eq!( - d.diagnostics[0].location().span, - Some(TextRange::new(7.into(), 9.into())), - "should have updated span" - ); - assert!(d.has_fatal_error()); - } - - #[test] - fn change_from_invalid_to_valid() { - let path = PgTPath::new("test.sql"); - let input = "select 1d;"; - - let mut d = Document::new(input.to_string(), 0); - - assert_eq!(d.positions.len(), 0); - assert!(d.has_fatal_error()); - assert_eq!(d.diagnostics.len(), 1); - - let change = ChangeFileParams { - path: path.clone(), - version: 1, - changes: vec![ChangeParams { - text: "1".to_string(), - range: Some(TextRange::new(7.into(), 9.into())), - }], - }; - - let changed = d.apply_file_change(&change); - - assert_eq!(d.content, "select 1;"); - assert_eq!(changed.len(), 1, "should emit one change"); - assert!(matches!( - changed[0], - StatementChange::Added(AddedStatement { .. 
}) - )); - assert_eq!(d.positions.len(), 1, "should have one position"); - assert!(d.diagnostics.is_empty(), "should have no diagnostics"); - assert!(!d.has_fatal_error()); - } - #[test] fn within_statements() { let path = PgTPath::new("test.sql"); diff --git a/crates/pgt_workspace/src/workspace/server/document.rs b/crates/pgt_workspace/src/workspace/server/document.rs index ed0ca40f..89516b23 100644 --- a/crates/pgt_workspace/src/workspace/server/document.rs +++ b/crates/pgt_workspace/src/workspace/server/document.rs @@ -62,32 +62,21 @@ pub(crate) fn split_with_diagnostics( offset: Option, ) -> (Vec, Vec) { let o = offset.unwrap_or_else(|| 0.into()); - match pgt_statement_splitter::split(content) { - Ok(parse) => ( - parse.ranges, - parse - .errors - .into_iter() - .map(|err| { - SDiagnostic::new( - err.clone() - .with_file_span(err.location().span.map(|r| r + o)), - ) - }) - .collect(), - ), - Err(errs) => ( - vec![], - errs.into_iter() - .map(|err| { - SDiagnostic::new( - err.clone() - .with_file_span(err.location().span.map(|r| r + o)), - ) - }) - .collect(), - ), - } + let result = pgt_statement_splitter::split(content); + + ( + result.ranges, + result + .errors + .into_iter() + .map(|err| { + SDiagnostic::new( + err.clone() + .with_file_span(err.location().span.map(|r| r + o)), + ) + }) + .collect(), + ) } pub struct StatementIterator<'a> { diff --git a/docs/codegen/src/rules_docs.rs b/docs/codegen/src/rules_docs.rs index 92f0dc42..68db53db 100644 --- a/docs/codegen/src/rules_docs.rs +++ b/docs/codegen/src/rules_docs.rs @@ -442,7 +442,7 @@ fn print_diagnostics( }); // split and parse each statement - let stmts = pgt_statement_splitter::split(code).expect("unexpected parse error"); + let stmts = pgt_statement_splitter::split(code); for stmt in stmts.ranges { match pgt_query_ext::parse(&code[stmt]) { Ok(ast) => { diff --git a/xtask/rules_check/src/lib.rs b/xtask/rules_check/src/lib.rs index 68a6d650..da4b4c73 100644 --- a/xtask/rules_check/src/lib.rs +++ b/xtask/rules_check/src/lib.rs @@ -126,52 +126,47 @@ fn assert_lint( filter, }); - // split and parse each statement - match pgt_statement_splitter::split(code) { - Ok(stmts) => { - for stmt in stmts.ranges { - match pgt_query_ext::parse(&code[stmt]) { - Ok(ast) => { - for rule_diag in analyser.run(pgt_analyser::AnalyserContext { root: &ast }) - { - let diag = pgt_diagnostics::serde::Diagnostic::new(rule_diag); - - let category = diag.category().expect("linter diagnostic has no code"); - let severity = settings.get_severity_from_rule_code(category).expect( + let result = pgt_statement_splitter::split(code); + for stmt in result.ranges { + match pgt_query_ext::parse(&code[stmt]) { + Ok(ast) => { + for rule_diag in analyser.run(pgt_analyser::AnalyserContext { root: &ast }) { + let diag = pgt_diagnostics::serde::Diagnostic::new(rule_diag); + + let category = diag.category().expect("linter diagnostic has no code"); + let severity = settings.get_severity_from_rule_code(category).expect( "If you see this error, it means you need to run cargo codegen-configuration", ); - let error = diag - .with_severity(severity) - .with_file_path(&file_path) - .with_file_source_code(code); - - write_diagnostic(code, error)?; - } - } - Err(e) => { - let error = SyntaxDiagnostic::from(e) - .with_file_path(&file_path) - .with_file_source_code(code); - write_diagnostic(code, error)?; - } - }; + let error = diag + .with_severity(severity) + .with_file_path(&file_path) + .with_file_source_code(code); + + write_diagnostic(code, error)?; + } } - } - Err(errs) => { 
- // Print all diagnostics to help the user - let mut console = pgt_console::EnvConsole::default(); - for err in errs { - console.println( - pgt_console::LogLevel::Error, - markup! { - {PrintDiagnostic::verbose(&err)} - }, - ); + Err(e) => { + let error = SyntaxDiagnostic::from(e) + .with_file_path(&file_path) + .with_file_source_code(code); + write_diagnostic(code, error)?; } - bail!("Analysis of '{group}/{rule}' on the following code block returned a scan diagnostic.\n\n{code}"); + }; + } + if !result.errors.is_empty() { + // Print all diagnostics to help the user + let mut console = pgt_console::EnvConsole::default(); + for err in result.errors { + console.println( + pgt_console::LogLevel::Error, + markup! { + {PrintDiagnostic::verbose(&err)} + }, + ); } - }; + bail!("Analysis of '{group}/{rule}' on the following code block returned a scan diagnostic.\n\n{code}"); + } Ok(()) }
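
Note on using the new tokenizer (illustrative only, not part of the diff above): `pgt_tokenizer::tokenize` yields `Token`s that carry just a kind and a byte length, so a consumer recovers the token text by tracking its own running offset, exactly as the `lex` helper in the tests does. A minimal sketch under that assumption, where `print_tokens` and the sample input are made up for demonstration:

use pgt_tokenizer::tokenize;

fn print_tokens(input: &str) {
    // Each token only stores its length, so keep a running byte offset
    // to slice the matching text back out of the input.
    let mut start = 0usize;
    for token in tokenize(input) {
        let end = start + token.len as usize;
        println!("{:?} @ {:?}", &input[start..end], token.kind);
        start = end;
    }
}

fn main() {
    print_tokens("select 'foo' -- trailing comment\n;");
}

The output format intentionally mirrors the `"content" @ Kind` lines in the snapshot files added by this change.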