Skip to content

Commit

Permalink
dotnet: support property feature extraction (#1168)
Browse files Browse the repository at this point in the history
  • Loading branch information
mike-hunhoff authored Sep 9, 2022
1 parent 580948e commit 3c1cd67
Show file tree
Hide file tree
Showing 15 changed files with 580 additions and 92 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### New Features
- verify rule metadata format on load #1160 @mr-tz
- extract property features from .NET PE files #1168 @anushkavirgaonkar

### Breaking Changes

Expand Down
31 changes: 24 additions & 7 deletions capa/features/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@
THUNK_CHAIN_DEPTH_DELTA = 5


class FeatureAccess:
READ = "read"
WRITE = "write"


VALID_FEATURE_ACCESS = (FeatureAccess.READ, FeatureAccess.WRITE)


def bytes_to_str(b: bytes) -> str:
return str(codecs.encode(b, "hex").decode("utf-8"))

Expand Down Expand Up @@ -92,15 +100,19 @@ def __nonzero__(self):


class Feature(abc.ABC):
def __init__(self, value: Union[str, int, float, bytes], description=None):
def __init__(
self,
value: Union[str, int, float, bytes],
description: Optional[str] = None,
):
"""
Args:
value (any): the value of the feature, such as the number or string.
description (str): a human-readable description that explains the feature value.
"""
super(Feature, self).__init__()
self.name = self.__class__.__name__.lower()

self.name = self.__class__.__name__.lower()
self.value = value
self.description = description

Expand All @@ -119,23 +131,28 @@ def __lt__(self, other):
< capa.features.freeze.features.feature_from_capa(other).json()
)

def get_name_str(self) -> str:
"""
render the name of this feature, for use by `__str__` and friends.
subclasses should override to customize the rendering.
"""
return self.name

def get_value_str(self) -> str:
"""
render the value of this feature, for use by `__str__` and friends.
subclasses should override to customize the rendering.
Returns: any
"""
return str(self.value)

def __str__(self):
if self.value is not None:
if self.description:
return "%s(%s = %s)" % (self.name, self.get_value_str(), self.description)
return "%s(%s = %s)" % (self.get_name_str(), self.get_value_str(), self.description)
else:
return "%s(%s)" % (self.name, self.get_value_str())
return "%s(%s)" % (self.get_name_str(), self.get_value_str())
else:
return "%s" % self.name
return "%s" % self.get_name_str()

def __repr__(self):
return str(self)
Expand Down
150 changes: 103 additions & 47 deletions capa/features/extractors/dnfile/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from __future__ import annotations

import logging
from enum import Enum
from typing import Any, Tuple, Iterator, Optional

import dnfile
Expand All @@ -17,6 +18,8 @@
from dncil.clr.token import Token, StringToken, InvalidToken
from dncil.cil.body.reader import CilMethodBodyReaderBase

from capa.features.common import FeatureAccess

logger = logging.getLogger(__name__)

# key indexes to dotnet metadata tables
Expand All @@ -41,72 +44,63 @@ def seek(self, offset: int) -> int:
return self.offset


class DnClass(object):
def __init__(self, token: int, namespace: str, classname: str):
self.token: int = token
self.namespace: str = namespace
self.classname: str = classname
class DnType(object):
def __init__(self, token: int, class_: str, namespace: str = "", member: str = "", access: Optional[str] = None):
self.token = token
self.access = access
self.namespace = namespace
self.class_ = class_
self.member = member

def __hash__(self):
return hash((self.token,))
return hash((self.token, self.access, self.namespace, self.class_, self.member))

def __eq__(self, other):
return self.token == other.token
return (
self.token == other.token
and self.access == other.access
and self.namespace == other.namespace
and self.class_ == other.class_
and self.member == other.member
)

def __str__(self):
return DnClass.format_name(self.namespace, self.classname)
return DnType.format_name(self.class_, namespace=self.namespace, member=self.member)

def __repr__(self):
return str(self)

@staticmethod
def format_name(namespace: str, classname: str):
name: str = classname
if namespace:
# like System.IO.File::OpenRead
name = f"{namespace}.{name}"
return name


class DnMethod(DnClass):
def __init__(self, token: int, namespace: str, classname: str, methodname: str):
super(DnMethod, self).__init__(token, namespace, classname)
self.methodname: str = methodname

def __str__(self):
return DnMethod.format_name(self.namespace, self.classname, self.methodname)

@staticmethod
def format_name(namespace: str, classname: str, methodname: str): # type: ignore
def format_name(class_: str, namespace: str = "", member: str = ""):
# like File::OpenRead
name: str = f"{classname}::{methodname}"
name: str = f"{class_}::{member}" if member else class_
if namespace:
# like System.IO.File::OpenRead
name = f"{namespace}.{name}"
return name


class DnUnmanagedMethod:
def __init__(self, token: int, modulename: str, methodname: str):
def __init__(self, token: int, module: str, method: str):
self.token: int = token
self.modulename: str = modulename
self.methodname: str = methodname
self.module: str = module
self.method: str = method

def __hash__(self):
return hash((self.token,))
return hash((self.token, self.module, self.method))

def __eq__(self, other):
return self.token == other.token
return self.token == other.token and self.module == other.module and self.method == other.method

def __str__(self):
return DnUnmanagedMethod.format_name(self.modulename, self.methodname)
return DnUnmanagedMethod.format_name(self.module, self.method)

def __repr__(self):
return str(self)

@staticmethod
def format_name(modulename, methodname):
return f"{modulename}.{methodname}"
def format_name(module, method):
return f"{module}.{method}"


def resolve_dotnet_token(pe: dnfile.dnPE, token: Token) -> Any:
Expand Down Expand Up @@ -139,7 +133,7 @@ def read_dotnet_method_body(pe: dnfile.dnPE, row: dnfile.mdtable.MethodDefRow) -
try:
return CilMethodBody(DnfileMethodBodyReader(pe, row))
except MethodBodyFormatError as e:
logger.warn("failed to parse managed method body @ 0x%08x (%s)" % (row.Rva, e))
logger.warning("failed to parse managed method body @ 0x%08x (%s)" % (row.Rva, e))
return None


Expand All @@ -148,7 +142,7 @@ def read_dotnet_user_string(pe: dnfile.dnPE, token: StringToken) -> Optional[str
try:
user_string: Optional[dnfile.stream.UserString] = pe.net.user_strings.get_us(token.rid)
except UnicodeDecodeError as e:
logger.warn("failed to decode #US stream index 0x%08x (%s)" % (token.rid, e))
logger.warning("failed to decode #US stream index 0x%08x (%s)" % (token.rid, e))
return None

if user_string is None:
Expand All @@ -157,7 +151,7 @@ def read_dotnet_user_string(pe: dnfile.dnPE, token: StringToken) -> Optional[str
return user_string.value


def get_dotnet_managed_imports(pe: dnfile.dnPE) -> Iterator[DnMethod]:
def get_dotnet_managed_imports(pe: dnfile.dnPE) -> Iterator[DnType]:
"""get managed imports from MemberRef table
see https://www.ntcore.com/files/dotnetformat.htm
Expand All @@ -174,12 +168,11 @@ def get_dotnet_managed_imports(pe: dnfile.dnPE) -> Iterator[DnMethod]:
for (rid, row) in enumerate(iter_dotnet_table(pe, "MemberRef")):
if not isinstance(row.Class.row, dnfile.mdtable.TypeRefRow):
continue

token: int = calculate_dotnet_token_value(pe.net.mdtables.MemberRef.number, rid + 1)
yield DnMethod(token, row.Class.row.TypeNamespace, row.Class.row.TypeName, row.Name)
yield DnType(token, row.Class.row.TypeName, namespace=row.Class.row.TypeNamespace, member=row.Name)


def get_dotnet_managed_methods(pe: dnfile.dnPE) -> Iterator[DnMethod]:
def get_dotnet_managed_methods(pe: dnfile.dnPE) -> Iterator[DnType]:
"""get managed method names from TypeDef table
see https://www.ntcore.com/files/dotnetformat.htm
Expand All @@ -193,7 +186,70 @@ def get_dotnet_managed_methods(pe: dnfile.dnPE) -> Iterator[DnMethod]:
for row in iter_dotnet_table(pe, "TypeDef"):
for index in row.MethodList:
token = calculate_dotnet_token_value(index.table.number, index.row_index)
yield DnMethod(token, row.TypeNamespace, row.TypeName, index.row.Name)
yield DnType(token, row.TypeName, namespace=row.TypeNamespace, member=index.row.Name)


def get_dotnet_fields(pe: dnfile.dnPE) -> Iterator[DnType]:
"""get fields from TypeDef table"""
for row in iter_dotnet_table(pe, "TypeDef"):
for index in row.FieldList:
token = calculate_dotnet_token_value(index.table.number, index.row_index)
yield DnType(token, row.TypeName, namespace=row.TypeNamespace, member=index.row.Name)


def get_dotnet_property_map(
pe: dnfile.dnPE, property_row: dnfile.mdtable.PropertyRow
) -> Optional[dnfile.mdtable.TypeDefRow]:
"""get property map from PropertyMap table
see https://www.ntcore.com/files/dotnetformat.htm
21 - PropertyMap Table
List of Properties owned by a specific class.
Parent (index into the TypeDef table)
PropertyList (index into Property table). It marks the first of a contiguous run of Properties owned by Parent. The run continues to the smaller of:
the last row of the Property table
the next run of Properties, found by inspecting the PropertyList of the next row in this PropertyMap table
"""
for row in iter_dotnet_table(pe, "PropertyMap"):
for index in row.PropertyList:
if index.row.Name == property_row.Name:
return row.Parent.row
return None


def get_dotnet_properties(pe: dnfile.dnPE) -> Iterator[DnType]:
"""get property from MethodSemantics table
see https://www.ntcore.com/files/dotnetformat.htm
24 - MethodSemantics Table
Links Events and Properties to specific methods. For example one Event can be associated to more methods. A property uses this table to associate get/set methods.
Semantics (a 2-byte bitmask of type MethodSemanticsAttributes)
Method (index into the MethodDef table)
Association (index into the Event or Property table; more precisely, a HasSemantics coded index)
"""
for row in iter_dotnet_table(pe, "MethodSemantics"):
typedef_row = get_dotnet_property_map(pe, row.Association.row)
if typedef_row is None:
continue

token = calculate_dotnet_token_value(row.Method.table.number, row.Method.row_index)

if row.Semantics.msSetter:
access = FeatureAccess.WRITE
elif row.Semantics.msGetter:
access = FeatureAccess.READ
else:
access = None

yield DnType(
token,
typedef_row.TypeName,
access=access,
namespace=typedef_row.TypeNamespace,
member=row.Association.row.Name,
)


def get_dotnet_managed_method_bodies(pe: dnfile.dnPE) -> Iterator[Tuple[int, CilMethodBody]]:
Expand Down Expand Up @@ -226,20 +282,20 @@ def get_dotnet_unmanaged_imports(pe: dnfile.dnPE) -> Iterator[DnUnmanagedMethod]
ImportScope (index into the ModuleRef table)
"""
for row in iter_dotnet_table(pe, "ImplMap"):
modulename: str = row.ImportScope.row.Name
methodname: str = row.ImportName
module: str = row.ImportScope.row.Name
method: str = row.ImportName

# ECMA says "Each row of the ImplMap table associates a row in the MethodDef table (MemberForwarded) with the
# name of a routine (ImportName) in some unmanaged DLL (ImportScope)"; so we calculate and map the MemberForwarded
# MethodDef table token to help us later record native import method calls made from CIL
token: int = calculate_dotnet_token_value(row.MemberForwarded.table.number, row.MemberForwarded.row_index)

# like Kernel32.dll
if modulename and "." in modulename:
modulename = modulename.split(".")[0]
if module and "." in module:
module = module.split(".")[0]

# like kernel32.CreateFileA
yield DnUnmanagedMethod(token, modulename, methodname)
yield DnUnmanagedMethod(token, module, method)


def calculate_dotnet_token_value(table: int, rid: int) -> int:
Expand Down
Loading

0 comments on commit 3c1cd67

Please sign in to comment.