deepset-ai · davidsbatista · Dec 9, 2024 · Dec 6, 2024 · Dec 6, 2024 · Dec 9, 2024
@@ -73,7 +73,7 @@ class TransformersZeroShotDocumentClassifier:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         model: str,
         labels: List[str],

@@ -34,7 +34,7 @@ class AzureOpenAIDocumentEmbedder:
     ```
     """
 
-    def __init__(  # noqa: PLR0913 (too-many-arguments)
+    def __init__(  # noqa: PLR0913 (too-many-arguments) # pylint: disable=too-many-positional-arguments
         self,
         azure_endpoint: Optional[str] = None,
         api_version: Optional[str] = "2023-05-15",

@@ -33,7 +33,7 @@ class AzureOpenAITextEmbedder:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         azure_endpoint: Optional[str] = None,
         api_version: Optional[str] = "2023-05-15",

@@ -38,7 +38,7 @@ class OpenAITextEmbedder:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
         model: str = "text-embedding-ada-002",

@@ -95,7 +95,7 @@ class ContextRelevanceEvaluator(LLMEvaluator):
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         examples: Optional[List[Dict[str, Any]]] = None,
         progress_bar: bool = True,

@@ -82,7 +82,7 @@ class FaithfulnessEvaluator(LLMEvaluator):
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         examples: Optional[List[Dict[str, Any]]] = None,
         progress_bar: bool = True,

@@ -47,7 +47,7 @@ class LLMEvaluator:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         instructions: str,
         inputs: List[Tuple[str, Type[List]]],

@@ -55,7 +55,7 @@ class AzureOpenAIGenerator(OpenAIGenerator):
     """
 
     # pylint: disable=super-init-not-called
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         azure_endpoint: Optional[str] = None,
         api_version: Optional[str] = "2023-05-15",

@@ -62,7 +62,7 @@ class AzureOpenAIChatGenerator(OpenAIChatGenerator):
     """
 
     # pylint: disable=super-init-not-called
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         azure_endpoint: Optional[str] = None,
         api_version: Optional[str] = "2023-05-15",

@@ -71,7 +71,7 @@ class HuggingFaceLocalChatGenerator:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         model: str = "HuggingFaceH4/zephyr-7b-beta",
         task: Optional[Literal["text-generation", "text2text-generation"]] = None,
@@ -295,7 +295,7 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str,
         ]
         return {"replies": chat_messages}
 
-    def create_message(
+    def create_message(  # pylint: disable=too-many-positional-arguments
         self,
         text: str,
         index: int,

@@ -38,7 +38,7 @@ class MetaFieldRanker:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         meta_field: str,
         weight: float = 1.0,
@@ -106,6 +106,7 @@ def __init__(
 
     def _validate_params(
         self,
+        *,
         weight: float,
         top_k: Optional[int],
         ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"],
@@ -156,7 +157,7 @@ def _validate_params(
             )
 
     @component.output_types(documents=List[Document])
-    def run(
+    def run(  # pylint: disable=too-many-positional-arguments
         self,
         documents: List[Document],
         top_k: Optional[int] = None,

@@ -51,7 +51,7 @@ class ExtractiveReader:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         model: Union[Path, str] = "deepset/roberta-base-squad2-distilled",
         device: Optional[ComponentDevice] = None,
@@ -192,8 +192,9 @@ def warm_up(self):
             )
             self.device = ComponentDevice.from_multiple(device_map=DeviceMap.from_hf(self.model.hf_device_map))
 
+    @staticmethod
     def _flatten_documents(
-        self, queries: List[str], documents: List[List[Document]]
+        queries: List[str], documents: List[List[Document]]
     ) -> Tuple[List[str], List[Document], List[int]]:
         """
         Flattens queries and Documents so all query-document pairs are arranged along one batch axis.
@@ -203,8 +204,8 @@ def _flatten_documents(
         query_ids = [i for i, documents_ in enumerate(documents) for _ in documents_]
         return flattened_queries, flattened_documents, query_ids
 
-    def _preprocess(
-        self, queries: List[str], documents: List[Document], max_seq_length: int, query_ids: List[int], stride: int
+    def _preprocess(  # pylint: disable=too-many-positional-arguments
+        self, *, queries: List[str], documents: List[Document], max_seq_length: int, query_ids: List[int], stride: int
     ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", List["Encoding"], List[int], List[int]]:
         """
         Splits and tokenizes Documents and preserves structures by returning mappings to query and Document IDs.
@@ -256,6 +257,7 @@ def _preprocess(
 
     def _postprocess(
         self,
+        *,
         start: "torch.Tensor",
         end: "torch.Tensor",
         sequence_ids: "torch.Tensor",
@@ -285,9 +287,9 @@ def _postprocess(
         masked_logits = torch.where(mask, logits, -torch.inf)
         probabilities = torch.sigmoid(masked_logits * self.calibration_factor)
 
-        flat_probabilities = probabilities.flatten(-2, -1)  # necessary for topk
+        flat_probabilities = probabilities.flatten(-2, -1)  # necessary for top-k
 
-        # topk can return invalid candidates as well if answers_per_seq > num_valid_candidates
+        # top-k can return invalid candidates as well if answers_per_seq > num_valid_candidates
         # We only keep probability > 0 candidates later on
         candidates = torch.topk(flat_probabilities, answers_per_seq)
         seq_length = logits.shape[-1]
@@ -343,6 +345,7 @@ def _add_answer_page_number(self, answer: ExtractedAnswer) -> ExtractedAnswer:
 
     def _nest_answers(
         self,
+        *,
         start: List[List[int]],
         end: List[List[int]],
         probabilities: "torch.Tensor",
@@ -526,7 +529,7 @@ def deduplicate_by_overlap(
         return deduplicated_answers
 
     @component.output_types(answers=List[ExtractedAnswer])
-    def run(
+    def run(  # pylint: disable=too-many-positional-arguments
         self,
         query: str,
         documents: List[Document],
@@ -594,9 +597,15 @@ def run(
         no_answer = no_answer if no_answer is not None else self.no_answer
         overlap_threshold = overlap_threshold or self.overlap_threshold
 
-        flattened_queries, flattened_documents, query_ids = self._flatten_documents(queries, nested_documents)
+        flattened_queries, flattened_documents, query_ids = ExtractiveReader._flatten_documents(
+            queries, nested_documents
+        )
         input_ids, attention_mask, sequence_ids, encodings, query_ids, document_ids = self._preprocess(
-            flattened_queries, flattened_documents, max_seq_length, query_ids, stride
+            queries=flattened_queries,
+            documents=flattened_documents,
+            max_seq_length=max_seq_length,
+            query_ids=query_ids,
+            stride=stride,
         )
 
         num_batches = math.ceil(input_ids.shape[0] / max_batch_size) if max_batch_size else 1
@@ -625,7 +634,12 @@ def run(
         end_logits = torch.cat(end_logits_list)
 
         start, end, probabilities = self._postprocess(
-            start_logits, end_logits, sequence_ids, attention_mask, answers_per_seq, encodings
+            start=start_logits,
+            end=end_logits,
+            sequence_ids=sequence_ids,
+            attention_mask=attention_mask,
+            answers_per_seq=answers_per_seq,
+            encodings=encodings,
         )
 
         answers = self._nest_answers(

@@ -72,7 +72,7 @@ class TransformersTextRouter:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         model: str,
         labels: Optional[List[str]] = None,

@@ -95,7 +95,7 @@ class TransformersZeroShotTextRouter:
     ```
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         labels: List[str],
         multi_label: bool = False,

@@ -58,7 +58,7 @@ class InMemoryDocumentStore:
     Stores data in-memory. It's ephemeral and cannot be saved to disk.
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
         self,
         bm25_tokenization_regex: str = r"(?u)\b\w\w+\b",
         bm25_algorithm: Literal["BM25Okapi", "BM25L", "BM25Plus"] = "BM25L",
@@ -541,7 +541,7 @@ def bm25_retrieval(
 
         return return_documents
 
-    def embedding_retrieval(
+    def embedding_retrieval(  # pylint: disable=too-many-positional-arguments
         self,
         query_embedding: List[float],
         filters: Optional[Dict[str, Any]] = None,

@@ -188,7 +188,7 @@ def patch_make_records_to_use_kwarg_string_interpolation(original_make_records:
     """A decorator to ensure string interpolation is used."""
 
     @functools.wraps(original_make_records)
-    def _wrapper(name, level, fn, lno, msg, args, exc_info, func=None, extra=None, sinfo=None) -> Any:
+    def _wrapper(name, level, fn, lno, msg, args, exc_info, func=None, extra=None, sinfo=None) -> Any:  # pylint: disable=too-many-positional-arguments
         safe_extra = extra or {}
         try:
             interpolated_msg = msg.format(**safe_extra)

@@ -124,7 +124,7 @@ def to_dict(self) -> Dict[str, Any]:
     return cls
 
 
-def component_class(
+def component_class(  # pylint: disable=too-many-positional-arguments
     name: str,
     input_types: Optional[Dict[str, Any]] = None,
     output_types: Optional[Dict[str, Any]] = None,

@@ -166,7 +166,7 @@ def resolve_hf_device_map(device: Optional[ComponentDevice], model_kwargs: Optio
     return model_kwargs
 
 
-def resolve_hf_pipeline_kwargs(
+def resolve_hf_pipeline_kwargs(  # pylint: disable=too-many-positional-arguments
     huggingface_pipeline_kwargs: Dict[str, Any],
     model: str,
     task: Optional[str],

@@ -242,6 +242,7 @@ max-locals = 45         # Default is 15
 max-module-lines = 2468 # Default is 1000
 max-nested-blocks = 9   # Default is 5
 max-statements = 206    # Default is 50
+
 [tool.pylint.'SIMILARITIES']
 min-similarity-lines = 6
 

@@ -321,7 +321,7 @@ def test_flatten_documents(mock_reader: ExtractiveReader):
 
 def test_preprocess(mock_reader: ExtractiveReader):
     _, _, seq_ids, _, query_ids, doc_ids = mock_reader._preprocess(
-        example_queries * 3, example_documents[0], 384, [1, 1, 1], 0
+        queries=example_queries * 3, documents=example_documents[0], max_seq_length=384, query_ids=[1, 1, 1], stride=0
     )
     expected_seq_ids = torch.full((3, 384), -1, dtype=torch.int)
     expected_seq_ids[:, :16] = 0
@@ -333,7 +333,11 @@ def test_preprocess(mock_reader: ExtractiveReader):
 
 def test_preprocess_splitting(mock_reader: ExtractiveReader):
     _, _, seq_ids, _, query_ids, doc_ids = mock_reader._preprocess(
-        example_queries * 4, example_documents[0] + [Document(content="a" * 64)], 96, [1, 1, 1, 1], 0
+        queries=example_queries * 4,
+        documents=example_documents[0] + [Document(content="a" * 64)],
+        max_seq_length=96,
+        query_ids=[1, 1, 1, 1],
+        stride=0,
     )
     assert seq_ids.shape[0] == 5
     assert query_ids == [1, 1, 1, 1, 1]
@@ -362,7 +366,12 @@ def test_postprocess(mock_reader: ExtractiveReader):
     encoding.token_to_chars = lambda i: (int(i), int(i) + 1)
 
     start_candidates, end_candidates, probs = mock_reader._postprocess(
-        start, end, sequence_ids, attention_mask, 3, [encoding, encoding]
+        start=start,
+        end=end,
+        sequence_ids=sequence_ids,
+        attention_mask=attention_mask,
+        answers_per_seq=3,
+        encodings=[encoding, encoding],
     )
 
     assert len(start_candidates) == len(end_candidates) == len(probs) == 2