From c361801842f1d35af9b2ff8981669bbde8a068b1 Mon Sep 17 00:00:00 2001
From: Alexander Verbitsky
Date: Thu, 8 Aug 2024 14:58:20 +0200
Subject: [PATCH] Stub files for fastwarc (#44)

---
 fastwarc/fastwarc/stream_io.pyi | 73 ++++++++++++++++++++++++++
 fastwarc/fastwarc/tools.pyi     | 48 ++++++++++++++++++
 fastwarc/fastwarc/warc.pyi      | 90 +++++++++++++++++++++++++++++++++
 fastwarc/pyproject.toml         |  3 ++
 4 files changed, 214 insertions(+)
 create mode 100644 fastwarc/fastwarc/stream_io.pyi
 create mode 100644 fastwarc/fastwarc/tools.pyi
 create mode 100644 fastwarc/fastwarc/warc.pyi

diff --git a/fastwarc/fastwarc/stream_io.pyi b/fastwarc/fastwarc/stream_io.pyi
new file mode 100644
index 00000000..4b39f212
--- /dev/null
+++ b/fastwarc/fastwarc/stream_io.pyi
@@ -0,0 +1,73 @@
+from typing import ContextManager, IO, Optional
+
+
+class IOStream(ContextManager):
+    def read(self, size: int) -> bytes: ...
+    def write(self, data: bytes) -> int: ...
+    def close(self) -> None: ...
+    def flush(self) -> None: ...
+    def seek(self, offset: int) -> None: ...
+    def tell(self) -> int: ...
+
+
+class BufferedReader:
+    def __init__(
+        self, stream: IOStream, buf_size: int = 8192, negotiate_stream: bool = True
+    ) -> None: ...
+    def close(self) -> None: ...
+    def consume(self, size: int = -1) -> int: ...
+    def read(self, size: int = -1) -> bytes: ...
+    def readline(self, crlf: bool = True, max_line_len: int = 8192) -> bytes: ...
+    def tell(self) -> int: ...
+
+
+class BytesIOStream(IOStream):
+    def __init__(self, initial_data: Optional[bytes] = None) -> None: ...
+    def getvalue(self) -> bytes: ...
+
+
+class FileStream(IOStream):
+    def __init__(self, filename: str, mode: str = "rb") -> None: ...
+
+
+class CompressingStream(IOStream):
+    def begin_member(self) -> int: ...
+    def end_member(self) -> int: ...
+
+
+class BrotliStream(CompressingStream):
+    def __init__(
+        self, raw_stream: IOStream, quality: int = 11, lgwin: int = 22, lgblock: int = 0
+    ) -> None: ...
+
+
+class GZipStream(CompressingStream):
+    def __init__(
+        self, raw_stream: IOStream, compression_level: int = 9, zlib: bool = False
+    ) -> None: ...
+
+
+class LZ4Stream(CompressingStream):
+    def __init__(
+        self,
+        raw_stream: IOStream,
+        compression_level: int = 12,
+        favor_dec_speed: bool = True,
+    ) -> None: ...
+    def prepopulate(self, initial_data: bytes) -> None: ...
+
+
+class PythonIOStreamAdapter(IOStream):
+    def __init__(self, py_stream: IO[bytes]) -> None: ...
+
+
+class FastWARCError(Exception):
+    pass
+
+
+class ReaderStaleError(FastWARCError):
+    pass
+
+
+class StreamError(FastWARCError):
+    pass
diff --git a/fastwarc/fastwarc/tools.pyi b/fastwarc/fastwarc/tools.pyi
new file mode 100644
index 00000000..b51fe5d8
--- /dev/null
+++ b/fastwarc/fastwarc/tools.pyi
@@ -0,0 +1,48 @@
+from enum import IntFlag
+from typing import Union, Iterator, Tuple
+
+from .stream_io import IOStream
+from .warc import WarcRecord
+
+
+class CompressionAlg(IntFlag):
+    gzip = 0
+    lz4 = 1
+    uncompressed = 2
+    auto = 3
+
+
+def detect_compression_algorithm(file: str) -> CompressionAlg: ...
+
+
+def wrap_warc_stream(
+    file: Union[str, IOStream],  # a str or a stream instance, not the stream class
+    mode: str,
+    comp_alg: CompressionAlg = CompressionAlg.auto,
+    **comp_args
+) -> IOStream: ...
+
+
+def recompress_warc_interactive(
+    warc_in: Union[str, IOStream],
+    warc_out: Union[str, IOStream],
+    comp_alg_in: CompressionAlg = CompressionAlg.auto,
+    comp_alg_out: CompressionAlg = CompressionAlg.auto,
+    **comp_args
+) -> Iterator[Tuple[WarcRecord, int]]: ...
+
+
+def recompress_warc(
+    warc_in: Union[str, IOStream],
+    warc_out: Union[str, IOStream],
+    comp_alg_in: CompressionAlg = CompressionAlg.auto,
+    comp_alg_out: CompressionAlg = CompressionAlg.auto,
+    **comp_args
+) -> Iterator[Tuple[WarcRecord, int]]: ...
+
+
+def verify_digests(
+    warc_in: Union[str, IOStream],
+    verify_payloads: bool = False,
+    comp_alg: CompressionAlg = CompressionAlg.auto,
+) -> bool: ...
diff --git a/fastwarc/fastwarc/warc.pyi b/fastwarc/fastwarc/warc.pyi
new file mode 100644
index 00000000..e7a860fe
--- /dev/null
+++ b/fastwarc/fastwarc/warc.pyi
@@ -0,0 +1,90 @@
+from datetime import datetime
+from typing import (
+    Optional,
+    Iterator,
+    Dict,
+    List,
+    Tuple,
+    MutableMapping,
+    ValuesView,
+    KeysView,
+)
+from enum import IntFlag
+
+from .stream_io import BufferedReader, IOStream
+
+
+class WarcRecordType(IntFlag):
+    warcinfo = 2
+    response = 4
+    resource = 8
+    request = 16
+    metadata = 32
+    revisit = 64
+    conversion = 128
+    continuation = 256
+    unknown = 512
+    any_type = 65535
+    no_type = 0
+
+
+no_type = WarcRecordType.no_type
+any_type = WarcRecordType.any_type
+
+
+class WarcHeaderMap(MutableMapping[str, str]):
+    reason_phrase: Optional[str]
+    status_code: Optional[int]
+    status_line: str
+
+    def append(self, key: str, value: str) -> None: ...
+    def asdict(self) -> Dict[str, str]: ...
+    def astuples(self) -> List[Tuple[str, str]]: ...
+    def clear(self) -> None: ...
+    def get(self, key: str, default: Optional[str] = None) -> Optional[str]: ...
+    def items(self) -> Iterator[Tuple[str, str]]: ...
+    def keys(self) -> KeysView[str]: ...
+    def values(self) -> ValuesView[str]: ...
+    def write(self, stream: IOStream) -> None: ...
+
+
+class WarcRecord:
+    record_id: str
+    record_type: WarcRecordType
+    content_length: int
+    record_date: Optional[datetime]
+    headers: WarcHeaderMap
+    is_http: bool
+    is_http_parsed: bool
+    http_headers: Optional[WarcHeaderMap]
+    http_content_type: Optional[str]
+    http_charset: Optional[str]
+    http_date: Optional[datetime]
+    http_last_modified: Optional[datetime]
+    reader: BufferedReader
+    stream_pos: int
+
+    def init_headers(
+        self,
+        content_length: int = 0,
+        record_type: WarcRecordType = no_type,
+        record_urn: Optional[str] = None,
+    ) -> None: ...
+    def freeze(self) -> bool: ...
+    def set_bytes_content(self, content: bytes) -> None: ...
+    def parse_http(self, strict_mode: bool = True, auto_decode: str = "none") -> None: ...
+    def verify_block_digest(self, consume: bool = False) -> bool: ...
+    def verify_payload_digest(self, consume: bool = False) -> bool: ...
+
+
+class ArchiveIterator(Iterator[WarcRecord]):  # declares __next__, so an Iterator
+    def __init__(
+        self,
+        stream: IOStream,  # a stream instance, not the stream class
+        record_types: WarcRecordType = any_type,
+        parse_http: bool = True,
+        min_content_length: int = -1,
+        max_content_length: int = -1,
+    ) -> None: ...
+    def __iter__(self) -> Iterator[WarcRecord]: ...
+    def __next__(self) -> WarcRecord: ...
diff --git a/fastwarc/pyproject.toml b/fastwarc/pyproject.toml
index 454576ad..078da507 100644
--- a/fastwarc/pyproject.toml
+++ b/fastwarc/pyproject.toml
@@ -34,6 +34,9 @@ test = [
 [tool.setuptools.packages.find]
 include = ["fastwarc*"]
 
+[tool.setuptools.package-data]
+"*" = ["*.pyi"]
+
 [tool.cibuildwheel]
 archs = "native"
 build = "cp3*"