Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .genignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ src/unstructured_client/users.py
# - Adjust the custom url snippets in the file
# - Bring back the ignore line and commit
src/unstructured_client/general.py

# Custom min_attempts / absolute_max_elapsed_time_ms fields on BackoffStrategy.
# Push upstream to Speakeasy templates to remove this entry.
src/unstructured_client/utils/retries.py
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
### Breaking changes
* Removed deprecated connector config models from the SDK (e.g. `S3SourceConnectorConfig`, `AzureDestinationConnectorConfig`). Pass connector configs as plain dicts with arbitrary fields. The SDK is no longer coupled to backend connector schemas — new fields work without an SDK upgrade.

### Features
* Add `min_attempts` and `absolute_max_elapsed_time_ms` fields to `BackoffStrategy`. `min_attempts` is the minimum number of retry attempts that must fire before `max_elapsed_time` is honored; defaults to `0` (preserves existing behavior). `absolute_max_elapsed_time_ms` caps when a new retry can start (does not interrupt in-flight requests); defaults to `None`. Together these close a short-circuit where a single slow first attempt could exhaust the retry budget before any retry fired. See FS-1988.

## 0.43.4

### Enhancements
Expand Down
42 changes: 42 additions & 0 deletions _test_unstructured_client/integration/test_decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,3 +813,45 @@ async def mock_send(_, request: httpx.Request, **kwargs):
assert number_of_transport_failures == 0
assert mock_endpoint_called
assert res.status_code == 200


def test_split_pdf_cache_tmp_data_chunk_request_stream_is_replay_safe(tmp_path):
    """Check that a PDF chunk request's stream yields identical bytes on replay.

    A sample PDF is copied into ``tmp_path`` and passed to
    ``create_pdf_chunk_request`` as an open binary file handle. The stream of
    the resulting request is then consumed twice, and both passes must be
    non-trivial in size and byte-for-byte equal.
    """
    from unstructured_client._hooks.custom.request_utils import (
        create_pdf_chunk_request,
    )

    sample_bytes = Path("_sample_docs/layout-parser-paper.pdf").read_bytes()
    chunk_path = tmp_path / "chunk.pdf"
    chunk_path.write_bytes(sample_bytes)

    # The context manager closes the handle even if an assertion below fails.
    with open(chunk_path, "rb") as pdf_chunk_file:
        request_headers = {
            "Content-Type": "multipart/form-data; boundary=test",
            "User-Agent": "test",
        }
        original_request = httpx.Request(
            method="POST",
            url="http://localhost:8000/general/v0/general",
            headers=request_headers,
            content=b"",
        )
        form_data = {
            "files": (chunk_path.name, sample_bytes, "application/pdf"),
            "strategy": "fast",
        }

        chunk_request = create_pdf_chunk_request(
            form_data=form_data,
            pdf_chunk=(pdf_chunk_file, 1),
            original_request=original_request,
            filename=chunk_path.name,
        )

        # Drain the stream directly, twice, instead of calling request.read(),
        # so that _content caching is bypassed and the stream itself is tested.
        first_pass, second_pass = (
            b"".join(chunk_request.stream),
            b"".join(chunk_request.stream),
        )

        assert len(first_pass) > 1000
        assert first_pass == second_pass
Loading