Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 53 additions & 55 deletions src/launchpad/size/symbols/macho_symbol_sizes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,18 @@
from typing import NamedTuple

import lief
import sentry_sdk

from launchpad.utils.logging import get_logger

logger = get_logger(__name__)


def _decode_name(raw_name: str | bytes) -> str:
"""Decode a name from bytes if necessary."""
return raw_name.decode("utf-8", errors="replace") if isinstance(raw_name, bytes) else str(raw_name)


class _SymbolSizeData(NamedTuple):
name: str
section_name: str | None
Expand All @@ -34,19 +40,16 @@ def __init__(self, binary: lief.MachO.Binary) -> None:

def get_symbol_sizes(self) -> list[SymbolSize]:
"""Get the symbol sizes."""
symbol_data = list(self._symbol_sizes(self.binary))

symbol_sizes: list[SymbolSize] = []
for data in symbol_data:
symbol_sizes.append(
SymbolSize(
mangled_name=data.name,
section_name=data.section_name,
segment_name=data.segment_name,
address=data.address,
size=data.size,
)
symbol_sizes = [
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's some light cleanup mixed in here as well; I left a comment on the bug fix.

SymbolSize(
mangled_name=data.name,
section_name=data.section_name,
segment_name=data.segment_name,
address=data.address,
size=data.size,
)
for data in self._symbol_sizes(self.binary)
]

logger.debug(f"Found {len(symbol_sizes)} symbol sizes")
symbol_sizes.sort(key=lambda x: x.size, reverse=True)
Expand Down Expand Up @@ -78,65 +81,60 @@ def _symbol_sizes(self, bin: lief.MachO.Binary) -> Generator[_SymbolSizeData]:

# sort symbols by their address so we can calculate the distance between them
syms = sorted((s for s in bin.symbols if self._is_measurable(s)), key=lambda s: s.value)
num_syms = len(syms)

cached_section = None
cached_section_va = None

for idx, sym in enumerate(syms):
start = sym.value

section = bin.section_from_virtual_address(start)
if section:
max_section_addr = section.virtual_address + section.size
raw_name = section.name
section_name = (
raw_name.decode("utf-8", errors="replace") if isinstance(raw_name, bytes) else str(raw_name)
)

if section.segment:
raw_seg_name = section.segment.name
segment_name = (
raw_seg_name.decode("utf-8", errors="replace")
if isinstance(raw_seg_name, bytes)
else str(raw_seg_name)
)
else:
segment_name = None
if cached_section_va == start:
section = cached_section
else:
max_section_addr = None
section_name = None
segment_name = None
section = bin.section_from_virtual_address(start)

if not section:
logger.warning("size.macho.symbol_not_found_in_section", extra={"symbol": sym.name})
cached_section = None
cached_section_va = None
continue

# Only calculate the distance between symbols in the same section
if max_section_addr:
if idx + 1 < len(syms):
next_sym = syms[idx + 1]
next_sym_section = bin.section_from_virtual_address(next_sym.value)
end = (
next_sym.value
if next_sym_section and next_sym_section.name == section.name
else max_section_addr
)
else:
end = max_section_addr
max_section_addr = section.virtual_address + section.size
section_name = _decode_name(section.name)
segment_name = _decode_name(section.segment.name) if section.segment else None

if idx + 1 < num_syms:
next_sym = syms[idx + 1]
next_sym_section = bin.section_from_virtual_address(next_sym.value)
cached_section = next_sym_section
cached_section_va = next_sym.value
same_section = (
next_sym_section
and next_sym_section.segment
and next_sym_section.segment.name == section.segment.name
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the core fix: we had a case where a symbol got compared to a section in a different segment.

and next_sym_section.name == section.name
)
end = next_sym.value if same_section else max_section_addr
else:
end = syms[idx + 1].value
end = max_section_addr
cached_section = None
cached_section_va = None

# Convert virtual addresses to file offsets to calculate the disk size
offset_end = bin.virtual_address_to_offset(end)
offset_start = bin.virtual_address_to_offset(start)
size = 0
if not isinstance(offset_end, lief.lief_errors) and not isinstance(offset_start, lief.lief_errors):
raw_size = offset_end - offset_start
if raw_size < 0:
logger.warning(
"size.macho.negative_symbol_size",
extra={
"symbol": sym.name,
"offset_start": offset_start,
"offset_end": offset_end,
"section": section_name,
},
)
error_context = {
"symbol": sym.name,
"offset_start": offset_start,
"offset_end": offset_end,
"section": section_name,
}
logger.warning("size.macho.negative_symbol_size", extra=error_context)
sentry_sdk.capture_message("size.macho.negative_symbol_size", level="error", extras=error_context)
size = max(0, raw_size)
else:
logger.warning(f"Failed to calculate size for symbol {sym.name}")
Expand Down
Loading