Combine all EdgeQL grammars into a single one (#6175)

Aljaž Mur Eržen 2023-09-28 19:29:39 +02:00 committed by GitHub
parent 9b8b8f58ec
commit 7c1de21247
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
24 changed files with 315 additions and 453 deletions
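In short: the five separate grammar modules (block, fragment, sdldocument, migration_body, extension_package_body) are folded into a single edb.edgeql.parser.grammar.start grammar, and callers select a "subgrammar" by naming a start token instead of a grammar module. A minimal before/after sketch (Python; assumes a working edb checkout with the Rust parser extension built):

```python
from edb.edgeql import parser as qlparser
from edb.edgeql.parser.grammar import tokens

# Before this commit, callers picked a grammar module:
#     qlparser.parse(qlgrammar.fragment, "select 1 + 1")
# Now a grammar token selects the entry point into the one combined spec:
qlparser.preload(allow_rebuild=True)
tree = qlparser.parse(tokens.T_STARTFRAGMENT, "select 1 + 1")
print(type(tree))  # an edb.edgeql.ast expression node
```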

View file

@@ -51,7 +51,7 @@ class TokenMeta(type):
if precedence_class is not None:
result._precedence_class = precedence_class
if name == 'Token':
if name == 'Token' or name == 'GrammarToken':
return result
if token is None:

View file

@@ -57,7 +57,7 @@ py_module_initializer!(
m.add(
py,
"cache_spec",
py_fn!(py, cache_spec(grammar_name: &PyString, py_spec: &PyObject)),
py_fn!(py, cache_spec(py_spec: &PyObject)),
)?;
m.add(py, "CSTNode", py.get_type::<CSTNode>())?;
m.add(py, "Production", py.get_type::<Production>())?;

View file

@@ -179,7 +179,8 @@ fn is_operator(token: &Token) -> bool {
| OpenBrace | CloseBrace | Dot | Semicolon | Colon | Add | Sub | Mul | Div | Modulo
| Pow | Less | Greater | Eq | Ampersand | Pipe | At => true,
DecimalConst | FloatConst | IntConst | BigIntConst | BinStr | Argument | Str
| BacktickName | Keyword(_) | Ident | Substitution | EOF | EOI | Epsilon => false,
| BacktickName | Keyword(_) | Ident | Substitution | EOF | EOI | Epsilon | StartBlock
| StartExtension | StartFragment | StartMigration | StartSDLDocument => false,
}
}

View file

@@ -1,34 +1,22 @@
use std::collections::HashMap;
use std::sync::Mutex;
use std::sync::OnceLock;
use cpython::{
ObjectProtocol, PyClone, PyInt, PyList, PyObject, PyResult, PyString, PyTuple, Python,
PythonObject, PythonObjectWithCheckedDowncast, ToPyObject, PyNone,
ObjectProtocol, PyClone, PyInt, PyList, PyNone, PyObject, PyResult, PyString, PyTuple, Python,
PythonObject, PythonObjectWithCheckedDowncast, ToPyObject,
};
use edgeql_parser::parser;
use once_cell::sync::Lazy;
use crate::errors::{parser_error_into_tuple, ParserResult};
use crate::pynormalize::value_to_py_object;
use crate::tokenizer::OpaqueToken;
pub fn parse(py: Python, grammar_name: &PyString, tokens: PyObject) -> PyResult<PyTuple> {
let mut spec_cache = PARSER_SPECS.lock().unwrap();
pub fn parse(py: Python, start_token_name: &PyString, tokens: PyObject) -> PyResult<PyTuple> {
let start_token_name = start_token_name.to_string(py).unwrap();
let grammar_name_str = grammar_name.to_string(py)?;
let (spec, productions) = match spec_cache.get(grammar_name_str.as_ref()) {
Some(spec) => spec,
None => {
let parsing_mod = py.import("edb.common.parsing")?;
let load_parser_spec = parsing_mod.get(py, "load_parser_spec")?;
let grammar_mod = py.import(grammar_name_str.as_ref())?;
let py_spec = load_parser_spec.call(py, (grammar_mod,), None)?;
_load_spec(py, &mut spec_cache, grammar_name_str.as_ref(), &py_spec)?
},
};
let (spec, productions) = get_spec(py)?;
let tokens = downcast_tokens(py, tokens)?;
let tokens = downcast_tokens(py, &start_token_name, tokens)?;
let context = parser::Context::new(spec);
let (cst, errors) = parser::parse(&tokens, &context);
@@ -90,39 +78,62 @@ py_class!(pub class Terminal |py| {
}
});
type ParserSpecs = HashMap<String, (parser::Spec, PyObject)>;
static PARSER_SPECS: OnceLock<(parser::Spec, PyObject)> = OnceLock::new();
static PARSER_SPECS: Lazy<Mutex<ParserSpecs>> = Lazy::new(|| Mutex::new(HashMap::new()));
fn downcast_tokens<'a>(py: Python, token_list: PyObject) -> PyResult<Vec<parser::Terminal>> {
fn downcast_tokens<'a>(
py: Python,
start_token_name: &str,
token_list: PyObject,
) -> PyResult<Vec<parser::Terminal>> {
let tokens = PyList::downcast_from(py, token_list)?;
let mut buf = Vec::with_capacity(tokens.len(py));
let mut buf = Vec::with_capacity(tokens.len(py) + 1);
buf.push(parser::Terminal::from_start_name(start_token_name));
for token in tokens.iter(py) {
let token = OpaqueToken::downcast_from(py, token)?;
let token = token.inner(py);
buf.push(parser::Terminal::from_token(token));
}
// adjust the span of the starting token for nicer error message spans
if buf.len() >= 2 {
buf[0].span.start = buf[1].span.start;
buf[0].span.end = buf[1].span.start;
}
Ok(buf)
}
pub fn cache_spec(
py: Python,
grammar_name: &PyString,
py_spec: &PyObject,
) -> PyResult<PyNone> {
let mut parser_specs = PARSER_SPECS.lock().unwrap();
_load_spec(py, &mut parser_specs, grammar_name.to_string(py)?.as_ref(), py_spec)?;
pub fn cache_spec(py: Python, py_spec: &PyObject) -> PyResult<PyNone> {
if PARSER_SPECS.get().is_some() {
return Ok(PyNone);
}
let x = load_spec(py, py_spec)?;
PARSER_SPECS.set(x).ok();
Ok(PyNone)
}
fn _load_spec<'a>(
py: Python,
specs: &'a mut ParserSpecs,
grammar_name: &str,
py_spec: &PyObject,
) -> PyResult<&'a (parser::Spec, PyObject)> {
fn get_spec(py: Python<'_>) -> Result<&(parser::Spec, PyObject), cpython::PyErr> {
if let Some(x) = PARSER_SPECS.get() {
return Ok(x);
}
let parsing_mod = py.import("edb.common.parsing")?;
let load_parser_spec = parsing_mod.get(py, "load_parser_spec")?;
let grammar_name = "edb.edgeql.parser.grammar.start";
let grammar_mod = py.import(grammar_name)?;
let py_spec = load_parser_spec.call(py, (grammar_mod,), None)?;
let x = load_spec(py, &py_spec)?;
PARSER_SPECS.set(x).ok();
Ok(PARSER_SPECS.get().unwrap())
}
fn load_spec(py: Python, py_spec: &PyObject) -> PyResult<(parser::Spec, PyObject)> {
let spec_to_json = py.import("edb.common.parsing")?.get(py, "spec_to_json")?;
let res = spec_to_json.call(py, (py_spec,), None)?;
@@ -132,11 +143,8 @@ fn _load_spec<'a>(
let spec_json = spec_json.to_string(py).unwrap();
let spec = parser::Spec::from_json(&spec_json).unwrap();
let productions = res.get_item(py, 1);
let result = (spec, productions);
specs.insert(grammar_name.to_string(), result);
Ok(specs.get(grammar_name).unwrap())
Ok((spec, productions))
}
fn to_py_cst<'a>(cst: &'a parser::CSTNode<'a>, py: Python) -> PyResult<CSTNode> {

View file

@@ -4,7 +4,7 @@ use append_only_vec::AppendOnlyVec;
use indexmap::IndexMap;
use crate::helpers::quote_name;
use crate::keywords::Keyword;
use crate::keywords::{self, Keyword};
use crate::position::Span;
use crate::tokenizer::{Error, Kind, Token, Value};
@@ -81,7 +81,7 @@ pub fn parse<'a>(input: &'a [Terminal], ctx: &'a Context) -> (Option<&'a CSTNode
let injection = new_token_for_injection(*token_kind, ctx);
let cost = error_cost(token_kind);
let cost = injection_cost(token_kind);
let error = Error::new(format!("Missing {injection}")).with_span(gap_span);
inject.push_error(error, cost);
@@ -519,12 +519,16 @@ const ERROR_COST_INJECT_MAX: u16 = 15;
const ERROR_COST_SKIP: u16 = 3;
const ERROR_COST_CUSTOM_ERROR: u16 = 3;
fn error_cost(kind: &Kind) -> u16 {
fn injection_cost(kind: &Kind) -> u16 {
use Kind::*;
match kind {
Ident => 9,
Substitution => 8,
// A few keywords that should not be injected since they result in
// confusing error messages.
Keyword(keywords::Keyword("delete" | "update" | "link")) => 100,
Keyword(_) => 10,
Dot => 5,
@@ -576,6 +580,17 @@ impl Terminal {
is_placeholder: false,
}
}
#[cfg(feature = "serde")]
pub fn from_start_name(start_name: &str) -> Self {
Terminal {
kind: get_token_kind(start_name),
text: "".to_string(),
value: None,
span: Default::default(),
is_placeholder: false,
}
}
}
#[cfg(feature = "serde")]
@@ -662,6 +677,12 @@ fn get_token_kind(token_name: &str) -> Kind {
"NICONST" => BigIntConst,
"SCONST" => Str,
"STARTBLOCK" => StartBlock,
"STARTEXTENSION" => StartExtension,
"STARTFRAGMENT" => StartFragment,
"STARTMIGRATION" => StartMigration,
"STARTSDLDOCUMENT" => StartSDLDocument,
"+=" => AddAssign,
"->" => Arrow,
":=" => Assign,

View file

@@ -127,6 +127,12 @@ pub enum Kind {
EOF,
EOI, // <$> (needed for LR parser)
Epsilon, // <e> (needed for LR parser)
StartBlock,
StartExtension,
StartFragment,
StartMigration,
StartSDLDocument,
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]

View file

@@ -16,20 +16,16 @@
# limitations under the License.
#
from __future__ import annotations
from typing import *
import importlib
import multiprocessing
import types
from edb import errors
from edb.common import parsing
import edb._edgeql_parser as rust_parser
from . import grammar as qlgrammar
from .grammar import tokens
from .. import ast as qlast
from .. import tokenizer as qltokenizer
@@ -61,7 +57,7 @@ def parse_fragment(
source: Union[qltokenizer.Source, str],
filename: Optional[str] = None,
) -> qlast.Expr:
res = parse(qlgrammar.fragment, source, filename=filename)
res = parse(tokens.T_STARTFRAGMENT, source, filename=filename)
assert isinstance(res, qlast.Expr)
return res
@@ -90,7 +86,7 @@ def parse_block(
source: qltokenizer.Source | str,
module_aliases: Optional[Mapping[Optional[str], str]] = None,
) -> list[qlast.Base]:
trees = parse(qlgrammar.block, source)
trees = parse(tokens.T_STARTBLOCK, source)
if module_aliases:
for tree in trees:
append_module_aliases(tree, module_aliases)
@@ -105,7 +101,7 @@ def parse_migration_body_block(
# (without braces)", so we just hack around this by adding braces.
# This is only really workable because we only use this in a place
# where the source contexts don't matter anyway.
return parse(qlgrammar.migration_body, f"{{{source}}}")
return parse(tokens.T_STARTMIGRATION, f"{{{source}}}")
def parse_extension_package_body_block(
@@ -116,22 +112,23 @@ def parse_extension_package_body_block(
# (without braces)", so we just hack around this by adding braces.
# This is only really workable because we only use this in a place
# where the source contexts don't matter anyway.
return parse(qlgrammar.extension_package_body, f"{{{source}}}")
return parse(tokens.T_STARTEXTENSION, f"{{{source}}}")
def parse_sdl(expr: str):
return parse(qlgrammar.sdldocument, expr)
return parse(tokens.T_STARTSDLDOCUMENT, expr)
def parse(
grammar: types.ModuleType,
start_token: Type[tokens.Token],
source: Union[str, qltokenizer.Source],
filename: Optional[str] = None,
):
if isinstance(source, str):
source = qltokenizer.Source.from_string(source)
result, productions = rust_parser.parse(grammar.__name__, source.tokens())
start_token_name = start_token.__name__[2:]
result, productions = rust_parser.parse(start_token_name, source.tokens())
if len(result.errors()) > 0:
# TODO: emit multiple errors
@@ -242,56 +239,14 @@ def _cst_to_ast(
return result.pop()
def _load_parser(grammar: str) -> None:
specmod = importlib.import_module(grammar)
parsing.load_parser_spec(specmod, allow_rebuild=True)
def preload(
allow_rebuild: bool = True,
paralellize: bool = False,
grammars: Optional[list[types.ModuleType]] = None,
) -> None:
if grammars is None:
grammars = [
qlgrammar.block,
qlgrammar.fragment,
qlgrammar.sdldocument,
qlgrammar.extension_package_body,
qlgrammar.migration_body,
]
if not paralellize:
try:
for grammar in grammars:
spec = parsing.load_parser_spec(
grammar, allow_rebuild=allow_rebuild)
rust_parser.cache_spec(grammar.__name__, spec)
except parsing.ParserSpecIncompatibleError as e:
raise errors.InternalServerError(e.args[0]) from None
else:
parsers_to_rebuild = []
for grammar in grammars:
try:
spec = parsing.load_parser_spec(grammar, allow_rebuild=False)
rust_parser.cache_spec(grammar.__name__, spec)
except parsing.ParserSpecIncompatibleError:
parsers_to_rebuild.append(grammar)
if len(parsers_to_rebuild) == 0:
pass
elif len(parsers_to_rebuild) == 1:
spec = parsing.load_parser_spec(
parsers_to_rebuild[0], allow_rebuild=True)
rust_parser.cache_spec(parsers_to_rebuild[0].__name__, spec)
def preload(allow_rebuild: bool = False) -> None:
grammar = qlgrammar.start
try:
spec = parsing.load_parser_spec(grammar, allow_rebuild=False)
except parsing.ParserSpecIncompatibleError as e:
if allow_rebuild:
spec = parsing.load_parser_spec(grammar, allow_rebuild=True)
else:
with multiprocessing.Pool(len(parsers_to_rebuild)) as pool:
pool.map(
_load_parser,
[mod.__name__ for mod in parsers_to_rebuild],
)
raise errors.InternalServerError(e.args[0]) from None
for grammar in parsers_to_rebuild:
spec = parsing.load_parser_spec(grammar, allow_rebuild=False)
rust_parser.cache_spec(grammar.__name__, spec)
rust_parser.cache_spec(spec)
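A sketch of the lower-level flow that the rewritten parse() and preload() share (again assuming the compiled edb._edgeql_parser extension): the Python class name loses its T_ prefix, the Rust side prepends a synthetic start terminal to the token stream, and get_spec() lazily loads the combined spec from edb.edgeql.parser.grammar.start on first use.

```python
from edb.edgeql import tokenizer as qltokenizer
from edb.edgeql.parser.grammar import tokens
import edb._edgeql_parser as rust_parser

source = qltokenizer.Source.from_string("select 1; select 2;")

# "T_STARTBLOCK" -> "STARTBLOCK"; the bare name is what the Rust
# parser maps to a Kind::StartBlock terminal and injects up front.
start_token_name = tokens.T_STARTBLOCK.__name__[2:]
result, productions = rust_parser.parse(start_token_name, source.tokens())
print(result.errors())  # [] on success
```

Note that parse_migration_body_block() and parse_extension_package_body_block() still wrap their source in braces before reaching this path, since the grammar has no production for a bare, brace-less command block.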

View file

@@ -8,8 +8,4 @@
from __future__ import annotations
from . import block as block # noqa
from . import extension_package_body as extension_package_body # noqa
from . import fragment as fragment # noqa
from . import migration_body as migration_body # noqa
from . import sdldocument as sdldocument # noqa
from . import start as start # noqa

View file

@@ -1,68 +0,0 @@
#
# This source file is part of the EdgeDB open source project.
#
# Copyright 2008-present MagicStack Inc. and the EdgeDB authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations
from edb.common import parsing
from .expressions import Nonterm
from .precedence import * # NOQA
from .tokens import * # NOQA
from .statements import * # NOQA
from .ddl import * # NOQA
from .session import * # NOQA
from .config import * # NOQA
class SingleStatement(Nonterm):
@parsing.inline(0)
def reduce_Stmt(self, _):
# Expressions
pass
@parsing.inline(0)
def reduce_DDLStmt(self, _):
# Data definition commands
pass
@parsing.inline(0)
def reduce_SessionStmt(self, _):
# Session-local utility commands
pass
@parsing.inline(0)
def reduce_ConfigStmt(self, _):
# Configuration commands
pass
class StatementBlock(parsing.ListNonterm, element=SingleStatement,
separator=Semicolons): # NOQA, Semicolons are from .ddl
pass
class EdgeQLBlock(Nonterm):
"%start"
@parsing.inline(0)
def reduce_StatementBlock_OptSemicolons_EOF(self, _, _semicolon, _eof):
pass
def reduce_OptSemicolons_EOF(self, _semicolon, _eof):
self.val = []

View file

@@ -1,36 +0,0 @@
#
# This source file is part of the EdgeDB open source project.
#
# Copyright 2008-present MagicStack Inc. and the EdgeDB authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations
from edb.common import parsing
from .expressions import Nonterm
from .precedence import * # NOQA
from .tokens import * # NOQA
from .statements import * # NOQA
from .ddl import * # NOQA
class CreateExtensionPackageBody(Nonterm):
"%start"
@parsing.inline(0)
def reduce_CreateExtensionPackageCommandsBlock_EOF(self, *kids):
pass

View file

@@ -1,39 +0,0 @@
#
# This source file is part of the EdgeDB open source project.
#
# Copyright 2023-present MagicStack Inc. and the EdgeDB authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations
from edb.common import parsing
from .expressions import Nonterm
from .expressions import * # NOQA
from .precedence import * # NOQA
from .tokens import * # NOQA
class ExpressionFragment(Nonterm):
"%start"
@parsing.inline(0)
def reduce_ExprStmt_EOF(self, *kids):
pass
@parsing.inline(0)
def reduce_Expr_EOF(self, *kids):
pass

View file

@@ -1,36 +0,0 @@
#
# This source file is part of the EdgeDB open source project.
#
# Copyright 2008-present MagicStack Inc. and the EdgeDB authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations
from edb.common import parsing
from .expressions import Nonterm
from .precedence import * # NOQA
from .tokens import * # NOQA
from .statements import * # NOQA
from .ddl import * # NOQA
class CreateMigrationBody(Nonterm):
"%start"
@parsing.inline(0)
def reduce_CreateMigrationCommandsBlock_EOF(self, *kids):
pass

View file

@@ -1,61 +0,0 @@
#
# This source file is part of the EdgeDB open source project.
#
# Copyright 2019-present MagicStack Inc. and the EdgeDB authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations
from edb.edgeql import ast as qlast
from .expressions import Nonterm
from .sdl import * # NOQA
from . import commondl
class SDLDocument(Nonterm):
"%start"
def reduce_OptSemicolons_EOF(self, *kids):
self.val = qlast.Schema(declarations=[])
def reduce_statement_without_semicolons(self, *kids):
r"""%reduce \
OptSemicolons SDLShortStatement EOF
"""
declarations = [kids[1].val]
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)
def reduce_statements_without_optional_trailing_semicolons(self, *kids):
r"""%reduce \
OptSemicolons SDLStatements \
OptSemicolons SDLShortStatement EOF
"""
declarations = kids[1].val + [kids[3].val]
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)
def reduce_OptSemicolons_SDLStatements_EOF(self, *kids):
declarations = kids[1].val
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)
def reduce_OptSemicolons_SDLStatements_Semicolons_EOF(self, *kids):
declarations = kids[1].val
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)

View file

@@ -0,0 +1,138 @@
#
# This source file is part of the EdgeDB open source project.
#
# Copyright 2008-present MagicStack Inc. and the EdgeDB authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations
from edb.common import parsing
from edb.edgeql import ast as qlast
from . import commondl
from .expressions import Nonterm
from .precedence import * # NOQA
from .tokens import * # NOQA
from .statements import * # NOQA
from .ddl import * # NOQA
from .session import * # NOQA
from .config import * # NOQA
# The main EdgeQL grammar, all of whose productions should start with a
# GrammarToken, which determines the "subgrammar" to use.
#
# To add a new "subgrammar":
# - add a new GrammarToken in tokens.py,
# - add a new production here,
# - add a new token kind in tokenizer.rs,
# - add a mapping from the Python token name into the Rust token kind
# in parser.rs `fn get_token_kind`
class EdgeQLGrammar(Nonterm):
"%start"
@parsing.inline(1)
def reduce_STARTBLOCK_EdgeQLBlock_EOF(self, *kids):
pass
@parsing.inline(1)
def reduce_STARTEXTENSION_CreateExtensionPackageCommandsBlock_EOF(self, *k):
pass
@parsing.inline(1)
def reduce_STARTMIGRATION_CreateMigrationCommandsBlock_EOF(self, *kids):
pass
@parsing.inline(1)
def reduce_STARTFRAGMENT_ExprStmt_EOF(self, *kids):
pass
@parsing.inline(1)
def reduce_STARTFRAGMENT_Expr_EOF(self, *kids):
pass
@parsing.inline(1)
def reduce_STARTSDLDOCUMENT_SDLDocument(self, *kids):
pass
class EdgeQLBlock(Nonterm):
@parsing.inline(0)
def reduce_StatementBlock_OptSemicolons(self, _, _semicolon):
pass
def reduce_OptSemicolons(self, _semicolon):
self.val = []
class SingleStatement(Nonterm):
@parsing.inline(0)
def reduce_Stmt(self, _):
# Expressions
pass
@parsing.inline(0)
def reduce_DDLStmt(self, _):
# Data definition commands
pass
@parsing.inline(0)
def reduce_SessionStmt(self, _):
# Session-local utility commands
pass
@parsing.inline(0)
def reduce_ConfigStmt(self, _):
# Configuration commands
pass
class StatementBlock(
parsing.ListNonterm, element=SingleStatement, separator=commondl.Semicolons
): # NOQA, Semicolons are from .ddl
pass
class SDLDocument(Nonterm):
def reduce_OptSemicolons_EOF(self, *kids):
self.val = qlast.Schema(declarations=[])
def reduce_statement_without_semicolons(self, *kids):
r"""%reduce \
OptSemicolons SDLShortStatement EOF
"""
declarations = [kids[1].val]
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)
def reduce_statements_without_optional_trailing_semicolons(self, *kids):
r"""%reduce \
OptSemicolons SDLStatements \
OptSemicolons SDLShortStatement EOF
"""
declarations = kids[1].val + [kids[3].val]
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)
def reduce_OptSemicolons_SDLStatements_EOF(self, *kids):
declarations = kids[1].val
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)
def reduce_OptSemicolons_SDLStatements_Semicolons_EOF(self, *kids):
declarations = kids[1].val
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)
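To make the checklist in the header comment concrete, here is a hypothetical sketch of the Python half of adding a new subgrammar. Every name below is invented for illustration, the edits belong in the existing modules, and the parser spec must be rebuilt before they take effect; the Rust half (a new Kind variant plus a get_token_kind arm) is not shown.

```python
from edb.edgeql.parser.grammar.tokens import GrammarToken

# 1. In tokens.py: a hypothetical new grammar token.
class T_STARTSINGLESTMT(GrammarToken):
    pass

# 2. In start.py, inside EdgeQLGrammar: a matching production.
#
#     @parsing.inline(1)
#     def reduce_STARTSINGLESTMT_SingleStatement_EOF(self, *kids):
#         pass
#
# 3./4. In tokenizer.rs and parser.rs: add Kind::StartSingleStmt and a
#     "STARTSINGLESTMT" arm in get_token_kind (Rust side, not shown).
```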

View file

@@ -42,6 +42,36 @@ class Token(parsing.Token, metaclass=TokenMeta,
pass
class GrammarToken(Token):
"""
Instead of having different grammars, we prefix each query with a special
grammar token that directs the parser to the appropriate grammar.
This greatly reduces the combined size of grammar specifications, since the
overlap between grammars is substantial.
"""
class T_STARTBLOCK(GrammarToken):
pass
class T_STARTEXTENSION(GrammarToken):
pass
class T_STARTFRAGMENT(GrammarToken):
pass
class T_STARTMIGRATION(GrammarToken):
pass
class T_STARTSDLDOCUMENT(GrammarToken):
pass
class T_DOT(Token, lextoken='.'):
pass
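The naming contract these classes establish: parse() strips the T_ prefix (the start_token.__name__[2:] in the parse() rewrite earlier in this commit), so each class name minus T_ must appear verbatim as a string arm in get_token_kind in parser.rs. A small sanity-check sketch:

```python
from edb.edgeql.parser.grammar import tokens

# The start-token names the Rust side must recognize; compare with
# the get_token_kind() arms added earlier in this commit.
for tok in tokens.GrammarToken.__subclasses__():
    print(tok.__name__[2:])
# STARTBLOCK, STARTEXTENSION, STARTFRAGMENT,
# STARTMIGRATION, STARTSDLDOCUMENT
```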

View file

@@ -115,7 +115,7 @@ def main(get_handler):
parser.add_argument("--version-serial", type=int)
args = parser.parse_args()
ql_parser.preload(allow_rebuild=devmode.is_in_dev_mode(), paralellize=True)
ql_parser.preload(allow_rebuild=devmode.is_in_dev_mode())
gc.freeze()
listen_for_debugger()

View file

@@ -180,16 +180,8 @@ async def _init_cluster(
def _init_parsers():
# Initialize parsers that are used in the server process.
from edb.edgeql import parser as ql_parser
from edb.edgeql.parser import grammar as ql_grammar
ql_parser.preload(
allow_rebuild=devmode.is_in_dev_mode(),
paralellize=True,
grammars=[
ql_grammar.block,
ql_grammar.fragment,
]
)
ql_parser.preload(allow_rebuild=devmode.is_in_dev_mode())
async def _run_server(

View file

@@ -22,7 +22,6 @@
from __future__ import annotations
from typing import *
import types
import typing
import functools
import os
@@ -176,12 +175,29 @@ class BaseDocTest(unittest.TestCase, metaclass=DocTestMeta):
)
class BaseSyntaxTest(BaseDocTest):
class PreloadParserGrammarMixin:
pass
def should_preload_parser(
cases: Iterable[unittest.TestCase],
) -> bool:
for cas in cases:
if isinstance(cas, PreloadParserGrammarMixin):
return True
return False
def preload_parser() -> None:
qlparser.preload(allow_rebuild=True)
class BaseSyntaxTest(BaseDocTest, PreloadParserGrammarMixin):
ast_to_source: Optional[Any] = None
markup_dump_lexer: Optional[str] = None
@classmethod
def get_grammar(cls):
def get_grammar_token(cls) -> Type[qlgrammar.tokens.GrammarToken]:
raise NotImplementedError
def run_test(self, *, source, spec, expected=None):
@@ -189,7 +205,7 @@ class BaseSyntaxTest(BaseDocTest):
if debug:
markup.dump_code(source, lexer=self.markup_dump_lexer)
inast = qlparser.parse(self.get_grammar(), source)
inast = qlparser.parse(self.get_grammar_token(), source)
if debug:
markup.dump(inast)
@@ -207,59 +223,6 @@ class BaseSyntaxTest(BaseDocTest):
self.assert_equal(expected_src, processed_src)
class TestCasesSetup:
def __init__(self, grammars: list[types.ModuleType]) -> None:
self.grammars = grammars
def get_test_cases_setup(
cases: Iterable[unittest.TestCase],
) -> Optional[TestCasesSetup]:
grammars: List[types.ModuleType] = []
for case in cases:
if not hasattr(case, 'get_grammar'):
continue
grammar = case.get_grammar()
if not grammar:
continue
elif isinstance(grammar, list):
grammars.extend(grammar)
else:
grammars.append(grammar)
if not grammars:
return None
else:
return TestCasesSetup(set(grammars))
def run_test_cases_setup(setup: TestCasesSetup, jobs: int) -> None:
qlparser.preload(
grammars=setup.grammars,
allow_rebuild=True,
paralellize=jobs > 1,
)
class AstValueTest(BaseDocTest):
def run_test(self, *, source, spec=None, expected=None):
debug = bool(os.environ.get(self.parser_debug_flag))
if debug:
markup.dump_code(source, lexer=self.markup_dump_lexer)
inast = qlparser.parse(self.get_grammar(), source)
if debug:
markup.dump(inast)
for var in inast.definitions[0].variables:
asttype, val = expected[var.name]
self.assertIsInstance(var.value, asttype)
self.assertEqual(var.value.value, val)
_std_schema = None
_refl_schema = None
_schema_class_layout = None
@@ -338,7 +301,7 @@ def new_compiler():
)
class BaseSchemaTest(BaseDocTest):
class BaseSchemaTest(BaseDocTest, PreloadParserGrammarMixin):
DEFAULT_MODULE = 'default'
SCHEMA: Optional[str] = None
@@ -352,15 +315,6 @@ class BaseSchemaTest(BaseDocTest):
else:
cls.schema = _load_std_schema()
@classmethod
def get_grammar(cls):
return [
qlgrammar.block,
qlgrammar.fragment,
qlgrammar.sdldocument,
qlgrammar.extension_package_body,
]
@classmethod
def run_ddl(cls, schema, ddl, default_module=defines.DEFAULT_MODULE_ALIAS):
statements = edgeql.parse_block(ddl)

View file

@@ -726,7 +726,7 @@ class EQLFunctionDirective(BaseEQLDirective):
try:
astnode = edgeql_parser.parse(
edgeql_grammar.block,
edgeql_grammar.tokens.T_STARTBLOCK,
f'create function {sig} using SQL function "xxx";')[0]
except Exception as ex:
raise self.error(
@@ -800,8 +800,9 @@ class EQLConstraintDirective(BaseEQLDirective):
try:
astnode = edgeql_parser.parse(
edgeql_grammar.block,
f'create abstract constraint {sig};')[0]
edgeql_grammar.tokens.T_STARTBLOCK,
f'create abstract constraint {sig};'
)[0]
except Exception as ex:
raise self.error(
f'could not parse constraint signature {sig!r}') from ex

View file

@@ -21,7 +21,7 @@ from typing import *
from edb.edgeql import ast as qlast
from edb.edgeql import tokenizer
from edb.edgeql import parser as qlparser
from edb.edgeql.parser import grammar as qlgrammar
from edb.edgeql.parser.grammar import tokens as qltokens
import edb._edgeql_parser as rust_parser
@@ -30,7 +30,7 @@ from edb.tools.edb import edbcommands
@edbcommands.command("parser-demo")
def main():
for q in QUERIES:
for q in QUERIES[-10:]:
sdl = q.startswith('sdl')
if sdl:
q = q[3:]
@@ -43,9 +43,10 @@ def main():
print(e)
continue
grammar = qlgrammar.sdldocument if sdl else qlgrammar.block
start_t = qltokens.T_STARTSDLDOCUMENT if sdl else qltokens.T_STARTBLOCK
start_t_name = start_t.__name__[2:]
tokens = source.tokens()
result, productions = rust_parser.parse(grammar.__name__, tokens)
result, productions = rust_parser.parse(start_t_name, tokens)
print('-' * 30)
print()
@@ -313,4 +314,7 @@ QUERIES = [
'''
SELECT INTROSPECT tuple<int64>;
''',
'''
(SELECT User.name) OFFSET 2;
''',
]

View file

@@ -824,7 +824,7 @@ class ParallelTextTestRunner:
)
setup = tb.get_test_cases_setup(cases)
server_used = tb.test_cases_use_server(cases)
lang_setup = tb_lang.get_test_cases_setup(cases)
preload_parser = tb_lang.should_preload_parser(cases)
bootstrap_time_taken = 0
tests_time_taken = 0
result = None
@@ -863,8 +863,8 @@
os.environ["EDGEDB_SERVER_JWS_KEY_FILE"] = str(jwk_file)
if lang_setup:
tb_lang.run_test_cases_setup(lang_setup, jobs=self.num_workers)
if preload_parser:
tb_lang.preload_parser()
try:
if setup:

View file

@@ -841,11 +841,7 @@ class build_parsers(setuptools.Command):
'alongside your pure Python modules')]
sources = [
"edb.edgeql.parser.grammar.block",
"edb.edgeql.parser.grammar.fragment",
"edb.edgeql.parser.grammar.sdldocument",
"edb.edgeql.parser.grammar.migration_body",
"edb.edgeql.parser.grammar.extension_package_body",
"edb.edgeql.parser.grammar.start",
]
def initialize_options(self):

View file

@@ -24,7 +24,7 @@ from edb import errors
from edb.testbase import lang as tb
from edb.edgeql import generate_source as edgeql_to_source
from edb.edgeql.parser import grammar as edgeql_grammar
from edb.edgeql.parser import grammar as qlgrammar
from edb.tools import test
@@ -35,8 +35,8 @@ class EdgeQLSyntaxTest(tb.BaseSyntaxTest):
ast_to_source = edgeql_to_source
@classmethod
def get_grammar(cls):
return edgeql_grammar.block
def get_grammar_token(cls):
return qlgrammar.tokens.T_STARTBLOCK
class TestEdgeQLParser(EdgeQLSyntaxTest):

View file

@@ -24,7 +24,7 @@ from edb import errors
from edb.testbase import lang as tb
from edb.edgeql import generate_source
from edb.edgeql.parser import grammar as ql_grammar
from edb.edgeql.parser import grammar as qlgrammar
from edb.tools import test
@@ -35,8 +35,8 @@ class SchemaSyntaxTest(tb.BaseSyntaxTest):
ast_to_source = functools.partial(generate_source, unsorted=True)
@classmethod
def get_grammar(cls):
return ql_grammar.sdldocument
def get_grammar_token(cls):
return qlgrammar.tokens.T_STARTSDLDOCUMENT
class TestEdgeSchemaParser(SchemaSyntaxTest):