Fix incorrect file offset calculation in memory mapping #220

Open · wants to merge 8 commits into main
5 changes: 5 additions & 0 deletions news/220.bugfix.rst
@@ -0,0 +1,5 @@
Fix incorrect file offset calculation when analyzing ELF files with
non-standard ELF layouts. Previously, pystack would fail to correctly analyze
Python binaries that had non-standard ELF layouts (for example when compiled
with certain linker options). The fix properly accounts for PT_LOAD segment
mappings when calculating file offsets.
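
For context, a minimal self-contained sketch of the translation this fix performs; the names here are illustrative, not pystack's actual API:

    #include <cstdint>
    #include <vector>

    // A PT_LOAD program header reduced to the three fields that matter here.
    struct LoadSegment {
        uint64_t vaddr;   // p_vaddr: where the segment is loaded in memory
        uint64_t offset;  // p_offset: where its bytes live in the file
        uint64_t filesz;  // p_filesz: how many bytes are file-backed
    };

    // Translate an unrelocated virtual address to a file offset by finding
    // the PT_LOAD segment that contains it.
    bool vaddrToFileOffset(const std::vector<LoadSegment>& segs, uint64_t vaddr, uint64_t* out)
    {
        for (const auto& seg : segs) {
            if (vaddr >= seg.vaddr && vaddr < seg.vaddr + seg.filesz) {
                *out = (vaddr - seg.vaddr) + seg.offset;
                return true;
            }
        }
        return false;  // not backed by any PT_LOAD segment
    }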
2 changes: 1 addition & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
 coverage[toml]
-pyinstaller<6.0
+pyinstaller
 pytest
 pytest-cov
 pytest-xdist
22 changes: 11 additions & 11 deletions src/pystack/_pystack.pyi
@@ -24,19 +24,19 @@ class CoreFileAnalyzer:
def missing_modules(self) -> List[str]: ...

class NativeReportingMode(enum.Enum):
-    ALL: int
-    OFF: int
-    PYTHON: int
+    ALL = 1
+    OFF = 2
+    PYTHON = 3

class StackMethod(enum.Enum):
-    ALL: int
-    ANONYMOUS_MAPS: int
-    AUTO: int
-    BSS: int
-    ELF_DATA: int
-    HEAP: int
-    SYMBOLS: int
-    DEBUG_OFFSETS: int
+    ALL = 1
+    ANONYMOUS_MAPS = 2
+    AUTO = 3
+    BSS = 4
+    ELF_DATA = 5
+    HEAP = 6
+    SYMBOLS = 7
+    DEBUG_OFFSETS = 8

class ProcessManager: ...

86 changes: 82 additions & 4 deletions src/pystack/_pystack/mem.cpp
@@ -398,17 +398,61 @@ CorefileRemoteMemoryManager::StatusCode
CorefileRemoteMemoryManager::getMemoryLocationFromCore(remote_addr_t addr, off_t* offset_in_file) const
{
auto corefile_it = std::find_if(d_vmaps.cbegin(), d_vmaps.cend(), [&](auto& map) {
-        return (map.Start() <= addr && addr < map.End()) && (map.FileSize() != 0 && map.Offset() != 0);
+        // When considering if the data is in the core file, we need to check if the address is
+        // within the chunk of the segment that is actually in the core file. map.End() corresponds
+        // to the end of the segment in memory so we need to use map.FileSize() to get the end of the
+        // segment in the core file.
+        uintptr_t fileEnd = map.Start() + map.FileSize();
pablogsal marked this conversation as resolved:

Contributor: Does this imply that fileEnd <= map.End()? If so, should we be checking that condition as well?

Member Author: Yep, do you want an assert for this?

Contributor: I was mostly checking my understanding, but I wouldn't be opposed to an assert. (A sketch of that assert follows this function.)

+        return (map.Start() <= addr && addr < fileEnd) && (map.FileSize() != 0 && map.Offset() != 0);
});
if (corefile_it == d_vmaps.cend()) {
return StatusCode::ERROR;
}

-    unsigned long base = corefile_it->Offset() - corefile_it->Start();
+    off_t base = corefile_it->Offset() - corefile_it->Start();
*offset_in_file = base + addr;
return StatusCode::SUCCESS;
}
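
A minimal sketch of the assert discussed in the thread above; this assumes a well-formed core file, where the file-backed portion of a segment never extends past its in-memory range:

    // Sketch only: with <cassert> included, the invariant from the review
    // thread could be checked right after computing fileEnd inside the lambda.
    uintptr_t fileEnd = map.Start() + map.FileSize();
    assert(fileEnd <= map.End());  // file-backed part must fit in the memory range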

CorefileRemoteMemoryManager::StatusCode
CorefileRemoteMemoryManager::initLoadSegments(const std::string& filename) const
{
int fd = open(filename.c_str(), O_RDONLY);
if (fd < 0) {
return StatusCode::ERROR;
}

Elf* elf = elf_begin(fd, ELF_C_READ, nullptr);
Contributor: Why ELF_C_READ? We use ELF_C_READ_MMAP everywhere else.
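// (Sketch of the reviewer's suggestion; ELF_C_READ_MMAP is the standard
// libelf command for an mmap-based reader:)
//     Elf* elf = elf_begin(fd, ELF_C_READ_MMAP, nullptr);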

if (!elf) {
close(fd);
return StatusCode::ERROR;
}

std::vector<ElfLoadSegment> segments;
size_t phnum;
if (elf_getphdrnum(elf, &phnum) == 0) {
for (size_t i = 0; i < phnum; i++) {
GElf_Phdr phdr_mem;
GElf_Phdr* phdr = gelf_getphdr(elf, i, &phdr_mem);
if (phdr == nullptr) continue;
Contributor: Isn't this just silently ignoring a legitimate error?

if (phdr->p_type == PT_LOAD) {
segments.push_back(
{.vaddr = phdr->p_vaddr, .offset = phdr->p_offset, .size = phdr->p_filesz});
}
}
}

elf_end(elf);
close(fd);
Contributor: Doesn't elf_end close the file descriptor? I thought a successful elf_begin takes over ownership of it, so this seems like a double close to me. Should be easy to confirm that by checking the return code...

Contributor: Ah, you're right. I misread some of our other code, which uses elf_unique_ptr to guarantee that elf_end gets called, and then doesn't call close after that point - but that's because it also uses file_unique_ptr, which always calls close, and I missed it because it was hidden in a destructor.

That said: maybe we should be reusing those two helper types, since we use them elsewhere. (A sketch of those helpers follows this function.)


if (!segments.empty()) {
d_elf_load_segments_cache[filename] = std::move(segments);
return StatusCode::SUCCESS;
}
return StatusCode::ERROR;
}
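
For reference, a minimal sketch of what reusing the elf_unique_ptr / file_unique_ptr helpers mentioned in the thread could look like; the deleters and the fd wrapper here are assumptions, not the codebase's actual definitions:

    #include <fcntl.h>
    #include <libelf.h>
    #include <memory>
    #include <unistd.h>

    // Assumed RAII shapes: elf_end() and close() run on every exit path,
    // so early returns cannot leak the handle or the descriptor.
    struct ElfDeleter { void operator()(Elf* e) const { if (e) elf_end(e); } };
    using elf_unique_ptr = std::unique_ptr<Elf, ElfDeleter>;

    struct ScopedFd {
        int fd;
        explicit ScopedFd(int f) : fd(f) {}
        ~ScopedFd() { if (fd >= 0) close(fd); }
        ScopedFd(const ScopedFd&) = delete;
        ScopedFd& operator=(const ScopedFd&) = delete;
    };

    // Usage sketch for initLoadSegments:
    //     ScopedFd fd(open(filename.c_str(), O_RDONLY));
    //     elf_unique_ptr elf(elf_begin(fd.fd, ELF_C_READ_MMAP, nullptr));
    //     ...any early return now cleans up both automatically...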

CorefileRemoteMemoryManager::StatusCode
CorefileRemoteMemoryManager::getMemoryLocationFromElf(
remote_addr_t addr,
@@ -418,12 +462,46 @@ CorefileRemoteMemoryManager::getMemoryLocationFromElf(
auto shared_libs_it = std::find_if(d_shared_libs.cbegin(), d_shared_libs.cend(), [&](auto& map) {
return map.start <= addr && addr <= map.end;
});

if (shared_libs_it == d_shared_libs.cend()) {
return StatusCode::ERROR;
}

*filename = &shared_libs_it->filename;
-    *offset_in_file = addr - shared_libs_it->start;
-    return StatusCode::SUCCESS;

// Check if we have cached segments for this file
auto cache_it = d_elf_load_segments_cache.find(**filename);
if (cache_it == d_elf_load_segments_cache.end()) {
// Initialize segments if not in cache
if (initLoadSegments(**filename) != StatusCode::SUCCESS) {
return StatusCode::ERROR;
}
cache_it = d_elf_load_segments_cache.find(**filename);
}

if (cache_it->second.empty()) {
return StatusCode::ERROR;
}
Comment on lines +483 to +485

Contributor: This case can never happen. Only initLoadSegments can add a key/value pair to the cache, and it returns an error if the value it would add is empty.


// Get the load address of the elf file
remote_addr_t elf_load_addr = cache_it->second[0].vaddr;

// Now relocate the address to the elf file
remote_addr_t symbol_vaddr = addr - (shared_libs_it->start - elf_load_addr);

// Find the correct segment
for (const auto& segment : cache_it->second) {
if (symbol_vaddr >= segment.vaddr && symbol_vaddr < segment.vaddr + segment.size) {
*offset_in_file = (symbol_vaddr - segment.vaddr) + segment.offset;
return StatusCode::SUCCESS;
}
}
Comment on lines +493 to +499

Contributor (@godlygeek, Jan 25, 2025): So we've got d_vmaps, which is a vector of files and their offsets in virtual memory, and we iterate over it to find the file that contains a given virtual address; then we look up the vector of segments contained in that file, and iterate over it to find the segment that contains the given virtual address.

Wouldn't everything be simpler if, instead of introducing a second level of vectors, we flattened things so that we keep a vector of segments as a member variable instead of a vector of ELF files?

Contributor (@godlygeek, Jan 25, 2025): The only advantage I can think of for how you've done it is that it's lazier and allows delaying reading the segments for an ELF file until the first time they're needed. But isn't reading the program headers quite fast? At least there's not much seeking involved, nor copying of strings, just reading a bunch of integers.

Member Author: I don't think I follow. These are different things:

  • d_vmaps is not a vector of files and their offsets: it is a vector of memory segments (some of which are backed by files) as seen by the core; this is what the core has mapped inside of it. Some data may be missing.
  • What we store in the cache via initLoadSegments, and what we iterate over here, are the segments in the ELF files that were mapped into the process while it was alive. Some of the data in these segments may be in the core and some may not. The addresses in these segments are not relocated to the process.

First, we look at d_vmaps for the file that contains a given absolute virtual address (an address in the process memory), and then we look up the ELF segments for that file; but what we search for is an address that is no longer absolute, because we have subtracted the load point of the library.

So I'm not sure I understand the proposal to flatten. There is nothing obvious that I can think of that we can flatten.

Contributor: We just talked offline, but for the sake of anyone who finds the PR later:

The thing that we could represent as a flat structure is a list of virtual address ranges, and, for each range, the file that should be read to find its contents and the offset to begin reading at to find the start of the range. This would let us stop treating from-core and from-executable as two separate cases, and use exactly the same code to handle them, at the cost of extra time in the constructor to build this mapping. (A sketch of this flattened structure follows this function.)

We argued back and forth a bit about the pros and cons of the two approaches, and agreed to try to pair program the other approach on Monday to see if it's as clean as I think it'll be or as slow and ugly as Pablo thinks it will be 😆


    LOG(ERROR) << "Failed to find the correct segment for address " << std::hex << std::showbase
               << addr << " (with vaddr offset " << symbol_vaddr << ")"
               << " in file " << **filename;

return StatusCode::ERROR;
}
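
For the record, a minimal sketch of the flattened structure discussed in the thread above: one vector of ranges, each knowing which file backs it and at what offset. The names are illustrative, not the codebase's:

    #include <cstdint>
    #include <string>
    #include <vector>

    // Illustrative flattened lookup table: from-core and from-executable
    // reads become the same case, resolved by a single range search.
    struct MappedRange {
        uintptr_t start;        // first virtual address covered
        uintptr_t end;          // one past the last covered address
        std::string file;       // core file or ELF file to read from
        uint64_t file_offset;   // offset of `start` within `file`
    };

    // Built once in the constructor; every read is then one search.
    bool resolve(const std::vector<MappedRange>& ranges, uintptr_t addr, std::string* file, uint64_t* off)
    {
        for (const auto& r : ranges) {
            if (r.start <= addr && addr < r.end) {
                *file = r.file;
                *off = r.file_offset + (addr - r.start);
                return true;
            }
        }
        return false;
    }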

bool
10 changes: 10 additions & 0 deletions src/pystack/_pystack/mem.h
@@ -207,6 +207,15 @@ class CorefileRemoteMemoryManager : public AbstractRemoteMemoryManager
ERROR,
};

struct ElfLoadSegment
{
GElf_Addr vaddr;
GElf_Off offset;
GElf_Xword size;
};
// Cache for PT_LOAD segments
mutable std::unordered_map<std::string, std::vector<ElfLoadSegment>> d_elf_load_segments_cache;
Contributor: I don't love this being mutable, but, fine... I guess you're delaying populating this cache instead of doing it when the memory manager is constructed to avoid paying the cost until the first time memory is actually read from a given shared library? (A sketch of the eager alternative follows this file's listing.)


// Data members
std::shared_ptr<CoreFileAnalyzer> d_analyzer;
std::vector<VirtualMap> d_vmaps;
@@ -220,5 +229,6 @@
remote_addr_t addr,
const std::string** filename,
off_t* offset_in_file) const;
StatusCode initLoadSegments(const std::string& filename) const;
};
} // namespace pystack
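
A short sketch of the eager alternative raised in the review comment on d_elf_load_segments_cache: populate the cache during construction, so lookups never mutate it and the mutable qualifier can be dropped. The constructor fragment below is an assumption; only the member and method names come from the diff:

    // Hypothetical constructor fragment: warm the PT_LOAD cache up front.
    // Failures are tolerated; affected lookups simply return ERROR later.
    for (const auto& lib : d_shared_libs) {
        (void)initLoadSegments(lib.filename);
    }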
11 changes: 11 additions & 0 deletions src/pystack/_pystack/process.cpp
@@ -750,6 +750,17 @@ AbstractProcessManager::findPythonVersion() const
}
int major = (version >> 24) & 0xFF;
int minor = (version >> 16) & 0xFF;

if (major == 0 && minor == 0) {
LOG(DEBUG) << "Failed to determine Python version from symbols: empty data copied";
return {-1, -1};
}

if (major != 2 && major != 3) {
LOG(DEBUG) << "Failed to determine Python version from symbols: invalid major version";
return {-1, -1};
}
Comment on lines +759 to +762

Contributor: Should we also check whether PY_RELEASE_LEVEL is in {0xA, 0xB, 0xC, 0xF} while we're at it? The valid values for that are almost as constrained as for the major version. (A sketch follows this function.)


LOG(DEBUG) << "Python version determined from symbols: " << major << "." << minor;
return {major, minor};
}
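
A sketch of the extra validation the reviewer suggests, assuming `version` follows CPython's PY_VERSION_HEX layout (the release level lives in bits 4-7); this is not part of the PR:

    // Hypothetical addition after the major-version check: reject impossible
    // release levels (0xA = alpha, 0xB = beta, 0xC = candidate, 0xF = final).
    int release_level = (version >> 4) & 0xF;
    if (release_level != 0xA && release_level != 0xB && release_level != 0xC
        && release_level != 0xF)
    {
        LOG(DEBUG) << "Failed to determine Python version from symbols: invalid release level";
        return {-1, -1};
    }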