Skip to content

Commit 8c80513

Browse files
committed
Update file utils
1 parent 4c05cee commit 8c80513

File tree

2 files changed

+22
-41
lines changed

2 files changed

+22
-41
lines changed

src/crawlee/_utils/file.py

Lines changed: 21 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -19,30 +19,21 @@
1919

2020
if sys.platform == 'win32':
2121

22-
def _write_file(
23-
path: Path,
24-
data: str | bytes,
25-
*,
26-
is_binary: bool,
27-
) -> str | None:
22+
def _write_file(path: Path, data: str | bytes) -> None:
2823
"""Windows-specific file write implementation.
2924
3025
This implementation writes directly to the file without using a temporary file, because
3126
temporary files are problematic due to permission issues on Windows.
3227
"""
33-
if is_binary:
34-
path.write_bytes(data) # type: ignore[arg-type]
28+
if isinstance(data, bytes):
29+
path.write_bytes(data)
30+
elif isinstance(data, str):
31+
path.write_text(data, encoding='utf-8')
3532
else:
36-
path.write_text(data, encoding='utf-8') # type: ignore[arg-type]
37-
return None
33+
raise TypeError(f'Unsupported data type: {type(data)}. Expected str or bytes.')
3834
else:
3935

40-
def _write_file(
41-
path: Path,
42-
data: str | bytes,
43-
*,
44-
is_binary: bool,
45-
) -> str | None:
36+
def _write_file(path: Path, data: str | bytes) -> None:
4637
"""Linux/Unix-specific file write implementation using temporary files."""
4738
dir_path = path.parent
4839
fd, tmp_path = tempfile.mkstemp(
@@ -51,17 +42,22 @@ def _write_file(
5142
dir=str(dir_path),
5243
)
5344

45+
if not isinstance(data, (str, bytes)):
46+
raise TypeError(f'Unsupported data type: {type(data)}. Expected str or bytes.')
47+
5448
try:
55-
if is_binary:
49+
if isinstance(data, bytes):
5650
with os.fdopen(fd, 'wb') as tmp_file:
57-
tmp_file.write(data) # type: ignore[arg-type]
51+
tmp_file.write(data)
5852
else:
5953
with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file:
60-
tmp_file.write(data) # type: ignore[arg-type]
54+
tmp_file.write(data)
55+
56+
# Atomically replace the destination file with the temporary file
57+
Path(tmp_path).replace(path)
6158
except Exception:
6259
Path(tmp_path).unlink(missing_ok=True)
6360
raise
64-
return tmp_path
6561

6662

6763
def infer_mime_type(value: Any) -> str:
@@ -106,7 +102,6 @@ async def atomic_write(
106102
path: Path,
107103
data: str,
108104
*,
109-
is_binary: bool = False,
110105
retry_count: int = 0,
111106
) -> None: ...
112107

@@ -116,7 +111,6 @@ async def atomic_write(
116111
path: Path,
117112
data: bytes,
118113
*,
119-
is_binary: bool = True,
120114
retry_count: int = 0,
121115
) -> None: ...
122116

@@ -125,48 +119,35 @@ async def atomic_write(
125119
path: Path,
126120
data: str | bytes,
127121
*,
128-
is_binary: bool = False,
129122
retry_count: int = 0,
130123
) -> None:
131124
"""Write data to a file atomically to prevent data corruption or partial writes.
132125
133-
This function handles both text and binary data. It ensures atomic writing by creating
134-
a temporary file and then atomically replacing the target file, which prevents data
135-
corruption if the process is interrupted during the write operation.
126+
This function handles both text and binary data. The binary mode is automatically
127+
detected based on the data type (bytes = binary, str = text). It ensures atomic
128+
writing by creating a temporary file and then atomically replacing the target file,
129+
which prevents data corruption if the process is interrupted during the write operation.
136130
137131
Args:
138132
path: The path to the destination file.
139133
data: The data to write to the file (string or bytes).
140-
is_binary: If True, write in binary mode. If False (default), write in text mode.
141134
retry_count: Internal parameter to track the number of retry attempts (default: 0).
142135
"""
143136
max_retries = 3
144-
tmp_path: str | None = None
145137

146138
try:
147139
# Use the platform-specific write function resolved at import time.
148-
tmp_path = await asyncio.to_thread(_write_file, path, data, is_binary=is_binary)
149-
150-
# On Linux/Unix, replace the destination file with tmp file.
151-
if tmp_path is not None:
152-
await asyncio.to_thread(os.replace, tmp_path, str(path))
140+
await asyncio.to_thread(_write_file, path, data)
153141
except (FileNotFoundError, PermissionError):
154142
if retry_count < max_retries:
155-
if tmp_path is not None:
156-
await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True)
157143
return await atomic_write(
158144
path,
159145
data,
160-
is_binary=is_binary,
161146
retry_count=retry_count + 1,
162147
)
163148
# If we reach the maximum number of retries, raise the exception.
164149
raise
165150

166-
finally:
167-
if tmp_path is not None:
168-
await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True)
169-
170151

171152
async def export_json_to_stream(
172153
iterator: AsyncIterator[dict[str, Any]],

src/crawlee/storage_clients/_file_system/_key_value_store_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No
321321
await asyncio.to_thread(self.path_to_kvs.mkdir, parents=True, exist_ok=True)
322322

323323
# Write the value to the file.
324-
await atomic_write(record_path, value_bytes, is_binary=True)
324+
await atomic_write(record_path, value_bytes)
325325

326326
# Write the record metadata to the file.
327327
await atomic_write(record_metadata_filepath, record_metadata_content)

0 commit comments

Comments
 (0)