From ad97c6fddbf9f6455ed94b49e745a238e21fb9da Mon Sep 17 00:00:00 2001 From: Sanjay Santhanam <51058514+Sanjays2402@users.noreply.github.com> Date: Thu, 2 Jul 2026 21:21:13 -0700 Subject: [PATCH] Preserve trailing whitespace after the final semicolon parse() is meant to be lossless: joining the returned statements should reproduce the input exactly. This held for trailing spaces/tabs after the last ";" but not for trailing whitespace containing a newline -- str(parse("select 1;\n")) was "select 1;", dropping the "\n". Root cause is in StatementSplitter.process(). Once a ";" arms consume_ws, a following newline token is deliberately treated as "not whitespace" for end-of-statement detection (so that "a;\nb;" splits into two statements), which starts a new statement buffer for the trailing whitespace. When that whitespace is at the very end of the input there is no following statement, so the buffer stays all-whitespace and is discarded by the final "not all whitespace" guard -- silently losing the exact input. Fix: hold each completed statement back by one segment. A held statement is emitted only once the next real (non-whitespace) token confirms a new statement has begun -- so inter-statement newline placement is byte-for-byte unchanged ("a;\nb;" still yields ["a;", "\nb;"]). At end of stream, any leftover all-whitespace tokens are reattached to the held statement instead of being dropped, making the trailing case lossless too. Whitespace-only input still yields zero statements, and split() is unaffected because it strips each statement. Verified with a 200k-iteration round-trip fuzzer (str(parse(sql)) == sql): 31+ failures before, 0 after; full suite 494 passed. Add parametrized regression test test_split_preserves_trailing_whitespace covering "\n", "\r\n", multiple/mixed trailing whitespace, the multi-statement case and a bare ";\n"; it fails on all 7 cases without the fix. 
--- CHANGELOG | 2 ++ sqlparse/engine/statement_splitter.py | 26 +++++++++++++++++++++++++- tests/test_split.py | 15 +++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index edfedb4f..41f77d74 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -33,6 +33,8 @@ Bug Fixes * Fix statement splitting (issue845). * Fix a late-binding closure bug in `TokenList.token_not_matching`. +* Preserve trailing whitespace after the final ``;`` so that + ``str(parse(sql)) == sql`` holds for input ending in a newline. Release 0.5.5 (Dec 19, 2025) diff --git a/sqlparse/engine/statement_splitter.py b/sqlparse/engine/statement_splitter.py index bc57d170..3b793e6d 100644 --- a/sqlparse/engine/statement_splitter.py +++ b/sqlparse/engine/statement_splitter.py @@ -152,6 +152,14 @@ def process(self, stream): """Process the stream""" EOS_TTYPE = T.Whitespace, T.Comment.Single + # A finished statement is held back for one segment instead of being + # yielded immediately. This lets whitespace that turns out to trail the + # whole input be reattached to the statement it follows, rather than + # being split off into a dangling all-whitespace buffer that is dropped + # at end of stream (which silently broke ``str(parse(sql)) == sql`` for + # any input ending in a newline after ``;``). + held_tokens = None + # Run over all stream tokens for ttype, value in stream: # Yield token if we finished a statement and there's no whitespaces @@ -159,7 +167,12 @@ def process(self, stream): # whitespace ignores newlines. # why don't multi line comments also count? if self.consume_ws and ttype not in EOS_TTYPE: - yield sql.Statement(self.tokens) + # A new statement starts here, so the previously held one is + # now known to be complete (its trailing whitespace, if any, + # already leads this new statement) and can be emitted. 
+ if held_tokens is not None: + yield sql.Statement(held_tokens) + held_tokens = self.tokens # Reset filter and prepare to process next statement self._reset() @@ -191,6 +204,17 @@ def process(self, stream): # token but not for BEGIN itself (which just set the flag) self._seen_begin = False + # Flush the held statement and whatever remains. Any trailing tokens + # left in ``self.tokens`` after the last statement was completed are + # pure whitespace (the split was armed by ``consume_ws``); reattach + # them to that statement so the exact input is preserved on join, + # instead of dropping them as a dangling all-whitespace buffer. + if held_tokens is not None: + if self.tokens and all(t.is_whitespace for t in self.tokens): + held_tokens = held_tokens + self.tokens + self.tokens = [] + yield sql.Statement(held_tokens) + # Yield pending statement (if any) if self.tokens and not all(t.is_whitespace for t in self.tokens): yield sql.Statement(self.tokens) diff --git a/tests/test_split.py b/tests/test_split.py index 92c3fefe..47ad6055 100644 --- a/tests/test_split.py +++ b/tests/test_split.py @@ -152,6 +152,21 @@ def test_split_ignores_empty_newlines(): assert stmts[1] == 'select bar;' +@pytest.mark.parametrize('s', ['select 1;\n', + 'select 1;\r\n', + 'select 1;\n\n', + 'select 1;\n ', + 'select 1; \n', + 'select 1;\nselect 2;\n', + ';\n']) +def test_split_preserves_trailing_whitespace(s): + # parse() must be lossless: whitespace following the final ';' was being + # split into a dangling all-whitespace statement and dropped, so joining + # the parsed statements no longer reproduced the input whenever it ended + # in a newline after ';' (trailing spaces alone were already preserved). + assert ''.join(str(stmt) for stmt in sqlparse.parse(s)) == s + + def test_split_quotes_with_new_line(): stmts = sqlparse.split('select "foo\nbar"') assert len(stmts) == 1