-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
213 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
from typing import ContextManager, IO | ||
|
||
|
||
class IOStream(ContextManager):
    """Type stub for the base stream interface.

    The class derives from ``ContextManager``. Every method body is a
    ``...`` placeholder, so only the signatures carry information.
    """

    def read(self, size: int) -> bytes:
        """Declared as taking an ``int`` size and returning ``bytes``."""
        ...

    def write(self, data: bytes) -> int:
        """Declared as taking ``bytes`` and returning an ``int``."""
        ...

    def close(self) -> None:
        """Declared as returning nothing."""
        ...

    def flush(self) -> None:
        """Declared as returning nothing."""
        ...

    def seek(self, offset: int) -> None:
        """Declared as taking an ``int`` offset and returning nothing."""
        ...

    def tell(self) -> int:
        """Declared as returning an ``int``."""
        ...
|
||
|
||
class BufferedReader:
    """Type stub for a reader that is constructed from an ``IOStream``.

    Only signatures are declared; all bodies are ``...`` placeholders.
    NOTE(review): the ``-1`` defaults presumably mean "no limit" -- confirm
    against the implementation.
    """

    def __init__(
        self,
        stream: IOStream,
        buf_size: int = 8192,
        negotiate_stream: bool = True,
    ) -> None:
        """Take the stream plus ``buf_size`` and ``negotiate_stream`` options."""
        ...

    def close(self) -> None:
        """Declared as returning nothing."""
        ...

    def consume(self, size: int = -1) -> int:
        """Declared as taking an optional ``int`` size and returning ``int``."""
        ...

    def read(self, size: int = -1) -> bytes:
        """Declared as taking an optional ``int`` size and returning ``bytes``."""
        ...

    def readline(self, crlf: bool = True, max_line_len: int = 8192) -> bytes:
        """Declared as returning ``bytes``; both options have defaults."""
        ...

    def tell(self) -> int:
        """Declared as returning an ``int``."""
        ...
|
||
|
||
class BytesIOStream(IOStream):
    """Type stub for an ``IOStream`` subclass that adds ``getvalue``."""

    def getvalue(self) -> bytes:
        """Declared as returning ``bytes``."""
        ...
|
||
|
||
class FileStream(IOStream):
    """Type stub for an ``IOStream`` subclass constructed from a file name."""

    def __init__(self, filename: str, mode: str = "rb") -> None:
        """Take a ``str`` file name and a ``str`` mode (default ``"rb"``)."""
        ...
|
||
|
||
class CompressingStream(IOStream):
    """Type stub for an ``IOStream`` subclass with member begin/end hooks.

    NOTE(review): the meaning of the returned ``int`` values is not visible
    in this stub -- confirm against the implementation.
    """

    def begin_member(self) -> int:
        """Declared as returning an ``int``."""
        ...

    def end_member(self) -> int:
        """Declared as returning an ``int``."""
        ...
|
||
|
||
class BrotliStream(CompressingStream):
    """Type stub for a ``CompressingStream`` subclass wrapping a raw stream."""

    def __init__(
        self,
        raw_stream: IOStream,
        quality: int = 11,
        lgwin: int = 22,
        lgblock: int = 0,
    ) -> None:
        """Take the raw stream and three ``int`` options with defaults."""
        ...
|
||
|
||
class GZipStream(CompressingStream):
    """Type stub for a ``CompressingStream`` subclass wrapping a raw stream."""

    def __init__(
        self,
        raw_stream: IOStream,
        compression_level: int = 9,
        zlib: bool = False,
    ) -> None:
        """Take the raw stream, an ``int`` level and a ``bool`` ``zlib`` flag."""
        ...
|
||
|
||
class LZ4Stream(CompressingStream):
    """Type stub for a ``CompressingStream`` subclass wrapping a raw stream."""

    def __init__(
        self,
        raw_stream: IOStream,
        compression_level: int = 12,
        favor_dec_speed: bool = True,
    ) -> None:
        """Take the raw stream, an ``int`` level and a ``bool`` speed flag."""
        ...

    def prepopulate(self, initial_data: bytes) -> None:
        """Declared as taking ``bytes`` and returning nothing."""
        ...
|
||
|
||
class PythonIOStreamAdapter(IOStream):
    """Type stub for an ``IOStream`` subclass built from a ``typing.IO`` object."""

    def __init__(self, py_stream: IO) -> None:
        """Take the Python stream object to adapt."""
        ...
|
||
|
||
class FastWARCError(Exception):
    """Exception type derived directly from ``Exception``; adds no members."""
|
||
|
||
class ReaderStaleError(FastWARCError):
    """Exception type derived from ``FastWARCError``; adds no members."""
|
||
|
||
class StreamError(FastWARCError):
    """Exception type derived from ``FastWARCError``; adds no members."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from enum import IntFlag | ||
from typing import Union, Type, Iterator, Tuple | ||
|
||
from .stream_io import IOStream | ||
from .warc import WarcRecord | ||
|
||
|
||
class CompressionAlg(IntFlag):
    """Integer constants naming the compression choices.

    NOTE(review): the base class is ``IntFlag``, so ``auto`` (3) is also the
    bitwise OR of ``lz4`` (1) and ``uncompressed`` (2) -- confirm that flag
    semantics are intended here.
    """

    gzip = 0
    lz4 = 1
    uncompressed = 2
    auto = 3
|
||
|
||
def detect_compression_algorithm(file: str) -> CompressionAlg:
    """Stub signature: take a ``str`` and return a ``CompressionAlg`` member."""
    ...
|
||
|
||
def wrap_warc_stream(
    file: Union[str, IOStream],
    mode: str,
    comp_alg: CompressionAlg = CompressionAlg.auto,
    **comp_args
) -> IOStream:
    """Stub signature: take a ``str`` or a stream and return a stream.

    The annotations use ``IOStream`` (an instance of the class or of any
    subclass). ``Type[IOStream]`` would instead denote the class object
    itself, which is not what a stream argument is.

    NOTE(review): the meaning of ``mode`` and of the extra ``**comp_args``
    is not visible in this stub -- confirm against the implementation.
    """
    ...
|
||
|
||
def recompress_warc_interactive(
    warc_in: Union[str, IOStream],
    warc_out: Union[str, IOStream],
    comp_alg_in: CompressionAlg = CompressionAlg.auto,
    comp_alg_out: CompressionAlg = CompressionAlg.auto,
    **comp_args
) -> Iterator[Tuple[WarcRecord, int]]:
    """Stub signature: iterate over ``(WarcRecord, int)`` pairs.

    Input and output are each either a ``str`` or a stream. They are
    annotated as ``IOStream`` instances, because ``Type[IOStream]`` would
    denote the class object rather than a stream.

    NOTE(review): what the ``int`` in each pair counts is not visible in
    this stub -- confirm against the implementation.
    """
    ...
|
||
|
||
def recompress_warc(
    warc_in: Union[str, IOStream],
    warc_out: Union[str, IOStream],
    comp_alg_in: CompressionAlg = CompressionAlg.auto,
    comp_alg_out: CompressionAlg = CompressionAlg.auto,
    **comp_args
) -> Iterator[Tuple[WarcRecord, int]]:
    """Stub signature with the same parameters as the interactive variant.

    Input and output are each either a ``str`` or a stream. They are
    annotated as ``IOStream`` instances, because ``Type[IOStream]`` would
    denote the class object rather than a stream.

    NOTE(review): the declared return type is identical to that of
    ``recompress_warc_interactive``; confirm that the non-interactive
    function really returns an iterator rather than nothing.
    """
    ...
|
||
|
||
def verify_digests(
    warc_in: Union[str, IOStream],
    verify_payloads: bool = False,
    comp_alg: CompressionAlg = CompressionAlg.auto,
) -> bool:
    """Stub signature: take a ``str`` or a stream and return a ``bool``.

    The input is annotated as an ``IOStream`` instance, because
    ``Type[IOStream]`` would denote the class object rather than a stream.

    NOTE(review): the declared ``bool`` result should be checked against
    the implementation; this stub shows nothing beyond the signature.
    """
    ...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
from datetime import datetime
from enum import IntFlag
from typing import (
    Dict,
    Iterable,
    Iterator,
    KeysView,
    List,
    MutableMapping,
    Optional,
    Tuple,
    Type,
    ValuesView,
)
|
||
from .stream_io import BufferedReader, IOStream | ||
|
||
|
||
class WarcRecordType(IntFlag):
    """Bit-flag constants for record types.

    Each named type from ``warcinfo`` to ``unknown`` is a distinct power of
    two. ``any_type`` is 65535 (all sixteen low bits set) and ``no_type``
    is 0.
    """

    # Single-bit members.
    warcinfo = 2
    response = 4
    resource = 8
    request = 16
    metadata = 32
    revisit = 64
    conversion = 128
    continuation = 256
    unknown = 512

    # Catch-all and empty values.
    any_type = 65535
    no_type = 0
|
||
|
||
# Module-level aliases for two WarcRecordType members.
any_type = WarcRecordType.any_type
no_type = WarcRecordType.no_type
|
||
|
||
class WarcHeaderMap(MutableMapping[str, str]):
    """Type stub for a string-to-string header mapping.

    The class derives from ``MutableMapping[str, str]`` and declares three
    extra attributes plus the methods below. All bodies are ``...``.
    """

    reason_phrase: Optional[str]
    # NOTE(review): declared as an optional string; confirm whether the
    # implementation actually yields an integer status code.
    status_code: Optional[str]
    status_line: str

    def append(self, key: str, value: str) -> None:
        """Declared as taking a ``str`` key and value and returning nothing."""
        ...

    def asdict(self) -> Dict[str, str]:
        """Declared as returning a ``dict`` of ``str`` to ``str``."""
        ...

    def astuples(self) -> List[Tuple[str, str]]:
        """Declared as returning a list of ``(str, str)`` pairs.

        A bare ``Tuple[str, str]`` would describe only a single pair.
        """
        ...

    def clear(self) -> None:
        """Declared as returning nothing."""
        ...

    def get(self, key: str, default: Optional[str] = None) -> Optional[str]:
        """Declared as returning a ``str`` or ``None``."""
        ...

    def items(self) -> Iterator[Tuple[str, str]]:
        """Declared as returning an iterator of ``(str, str)`` pairs."""
        ...

    def keys(self) -> KeysView[str]:
        """Declared as returning a ``KeysView`` of ``str``."""
        ...

    def values(self) -> ValuesView[str]:
        """Declared as returning a ``ValuesView`` of ``str``."""
        ...

    def write(self, stream: IOStream) -> None:
        """Declared as taking an ``IOStream`` and returning nothing."""
        ...
|
||
|
||
class WarcRecord:
    """Type stub for a single record.

    Each attribute is declared exactly once. All method bodies are ``...``
    placeholders, so only the signatures carry information.
    """

    # Record-level attributes.
    record_id: str
    record_type: WarcRecordType
    content_length: int
    record_date: Optional[datetime]
    headers: WarcHeaderMap

    # HTTP-related attributes; the optional ones may be None.
    is_http: bool
    is_http_parsed: bool
    http_headers: Optional[WarcHeaderMap]
    http_content_type: Optional[str]
    http_charset: Optional[str]
    http_date: Optional[datetime]
    http_last_modified: Optional[datetime]

    # Reader and position attributes.
    reader: BufferedReader
    stream_pos: int

    # NOTE(review): the type of record_urn and the return type are not
    # visible in this stub, so they are left unannotated.
    def init_headers(
        self,
        content_length: int = 0,
        record_type: WarcRecordType = no_type,
        record_urn=None,
    ): ...

    def freeze(self) -> bool:
        """Declared as returning a ``bool``."""
        ...

    def set_bytes_content(self, content: bytes) -> None:
        """Declared as taking ``bytes`` and returning nothing."""
        ...

    def parse_http(self, strict_mode: bool = True, auto_decode: str = "none") -> None:
        """Declared as returning nothing; both options have defaults."""
        ...

    def verify_block_digest(self, consume: bool = False) -> bool:
        """Declared as returning a ``bool``."""
        ...

    def verify_payload_digest(self, consume: bool = False) -> bool:
        """Declared as returning a ``bool``."""
        ...
|
||
|
||
class ArchiveIterator(Iterator[WarcRecord]):
    """Type stub for an iterator that yields ``WarcRecord`` objects.

    The class declares both ``__iter__`` and ``__next__``, so its base is
    ``Iterator`` (a subtype of ``Iterable``).
    """

    def __init__(
        self,
        stream: IOStream,
        record_types: WarcRecordType = any_type,
        parse_http: bool = True,
        min_content_length: int = -1,
        max_content_length: int = -1,
    ) -> None:
        """Take the stream to read from plus filtering options.

        ``stream`` is annotated as an ``IOStream`` instance, because
        ``Type[IOStream]`` would denote the class object rather than a
        stream.

        NOTE(review): the ``-1`` defaults presumably mean "no limit" --
        confirm against the implementation.
        """
        ...

    def __iter__(self) -> Iterator[WarcRecord]:
        """Declared as returning an iterator of ``WarcRecord``."""
        ...

    def __next__(self) -> WarcRecord:
        """Declared as returning a ``WarcRecord``."""
        ...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters