diff --git a/CHANGELOG b/CHANGELOG index edfedb4f..41f77d74 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -33,6 +33,8 @@ Bug Fixes * Fix statement splitting (issue845). * Fix a late-binding closure bug in `TokenList.token_not_matching`. +* Preserve trailing whitespace after the final ``;`` so that joining the + statements returned by ``parse(sql)`` reproduces input ending in a newline. Release 0.5.5 (Dec 19, 2025) diff --git a/sqlparse/engine/statement_splitter.py b/sqlparse/engine/statement_splitter.py index bc57d170..3b793e6d 100644 --- a/sqlparse/engine/statement_splitter.py +++ b/sqlparse/engine/statement_splitter.py @@ -152,6 +152,14 @@ def process(self, stream): """Process the stream""" EOS_TTYPE = T.Whitespace, T.Comment.Single + # A finished statement is held back for one segment instead of being + # yielded immediately. This lets whitespace that turns out to trail the + # whole input be reattached to the statement it follows, rather than + # being split off into a dangling all-whitespace buffer that is dropped + # at end of stream (which meant the joined statements silently failed + # to reproduce any input ending in a newline after ``;``). + held_tokens = None + # Run over all stream tokens for ttype, value in stream: # Yield token if we finished a statement and there's no whitespaces @@ -159,7 +167,12 @@ def process(self, stream): # whitespace ignores newlines. # why don't multi line comments also count? if self.consume_ws and ttype not in EOS_TTYPE: - yield sql.Statement(self.tokens) + # A new statement starts here, so the previously held one is + # now known to be complete (its trailing whitespace, if any, + # already leads this new statement) and can be emitted. 
+ if held_tokens is not None: + yield sql.Statement(held_tokens) + held_tokens = self.tokens # Reset filter and prepare to process next statement self._reset() @@ -191,6 +204,17 @@ def process(self, stream): # token but not for BEGIN itself (which just set the flag) self._seen_begin = False + # Flush the held statement and whatever remains. Tokens left in + # ``self.tokens`` after the last completed statement are reattached to + # it only when they are pure whitespace (e.g. the newline that caused + # the final split), so the exact input is preserved on join instead of + # being dropped as a dangling buffer; other tokens are yielded below. + if held_tokens is not None: + if self.tokens and all(t.is_whitespace for t in self.tokens): + held_tokens = held_tokens + self.tokens + self.tokens = [] + yield sql.Statement(held_tokens) + # Yield pending statement (if any) if self.tokens and not all(t.is_whitespace for t in self.tokens): yield sql.Statement(self.tokens) diff --git a/tests/test_split.py b/tests/test_split.py index 92c3fefe..47ad6055 100644 --- a/tests/test_split.py +++ b/tests/test_split.py @@ -152,6 +152,21 @@ def test_split_ignores_empty_newlines(): assert stmts[1] == 'select bar;' +@pytest.mark.parametrize('s', ['select 1;\n', + 'select 1;\r\n', + 'select 1;\n\n', + 'select 1;\n ', + 'select 1; \n', + 'select 1;\nselect 2;\n', + ';\n']) +def test_split_preserves_trailing_whitespace(s): + # parse() must be lossless: whitespace following the final ';' was being + # split into a dangling all-whitespace statement and dropped, so joining + # the parsed statements no longer reproduced the input whenever it ended + # in a newline after ';' (trailing spaces alone were already preserved). + assert ''.join(str(stmt) for stmt in sqlparse.parse(s)) == s + + def test_split_quotes_with_new_line(): stmts = sqlparse.split('select "foo\nbar"') assert len(stmts) == 1