bloomberg · pablogsal · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025 · Jan 24, 2025
diff --git a/news/220.bugfix.rst b/news/220.bugfix.rst
@@ -0,0 +1,5 @@
+Fix incorrect file offset calculation when analyzing ELF files with
+non-standard layouts. Previously, pystack would fail to correctly analyze
+Python binaries with such layouts (for example, when compiled with certain
+linker options). The fix properly accounts for PT_LOAD segment mappings when
+calculating file offsets.
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -1,5 +1,5 @@
 coverage[toml]
-pyinstaller<6.0
+pyinstaller
 pytest
 pytest-cov
 pytest-xdist

diff --git a/src/pystack/_pystack.pyi b/src/pystack/_pystack.pyi
@@ -24,19 +24,19 @@ class CoreFileAnalyzer:
     def missing_modules(self) -> List[str]: ...
 
 class NativeReportingMode(enum.Enum):
-    ALL: int
-    OFF: int
-    PYTHON: int
+    ALL = 1
+    OFF = 2
+    PYTHON = 3
 
 class StackMethod(enum.Enum):
-    ALL: int
-    ANONYMOUS_MAPS: int
-    AUTO: int
-    BSS: int
-    ELF_DATA: int
-    HEAP: int
-    SYMBOLS: int
-    DEBUG_OFFSETS: int
+    ALL = 1
+    ANONYMOUS_MAPS = 2
+    AUTO = 3
+    BSS = 4
+    ELF_DATA = 5
+    HEAP = 6
+    SYMBOLS = 7
+    DEBUG_OFFSETS = 8
 
 class ProcessManager: ...
 

diff --git a/src/pystack/_pystack/mem.cpp b/src/pystack/_pystack/mem.cpp
@@ -398,17 +398,61 @@ CorefileRemoteMemoryManager::StatusCode
 CorefileRemoteMemoryManager::getMemoryLocationFromCore(remote_addr_t addr, off_t* offset_in_file) const
 {
     auto corefile_it = std::find_if(d_vmaps.cbegin(), d_vmaps.cend(), [&](auto& map) {
-        return (map.Start() <= addr && addr < map.End()) && (map.FileSize() != 0 && map.Offset() != 0);
+        // When deciding whether the data is in the core file, we need to check that the address
+        // is within the chunk of the segment that is present in the core file. map.End() is the
+        // end of the segment in memory when the process was alive, but when the core was created
+        // not all that data was written, so we use map.FileSize() to get the segment's end in the core.
+        uintptr_t fileEnd = map.Start() + map.FileSize();
+        return (map.Start() <= addr && addr < fileEnd) && (map.FileSize() != 0 && map.Offset() != 0);
     });
     if (corefile_it == d_vmaps.cend()) {
         return StatusCode::ERROR;
     }
 
-    unsigned long base = corefile_it->Offset() - corefile_it->Start();
+    off_t base = corefile_it->Offset() - corefile_it->Start();
     *offset_in_file = base + addr;
     return StatusCode::SUCCESS;
 }
 
+CorefileRemoteMemoryManager::StatusCode  // SUCCESS when at least one PT_LOAD segment was cached, ERROR otherwise
+CorefileRemoteMemoryManager::initLoadSegments(const std::string& filename) const  // Collects the PT_LOAD program headers of `filename` into d_elf_load_segments_cache
+{
+    int fd = open(filename.c_str(), O_RDONLY);  // the ELF file is only read, never modified
+    if (fd < 0) {
+        return StatusCode::ERROR;
+    }
+
+    Elf* elf = elf_begin(fd, ELF_C_READ, nullptr);
+    if (!elf) {
+        close(fd);  // do not leak the descriptor when libelf cannot open the file
+        return StatusCode::ERROR;
+    }
+
+    std::vector<ElfLoadSegment> segments;
+    size_t phnum;
+    if (elf_getphdrnum(elf, &phnum) == 0) {  // a zero return is treated as success; otherwise `segments` stays empty
+        for (size_t i = 0; i < phnum; i++) {
+            GElf_Phdr phdr_mem;
+            GElf_Phdr* phdr = gelf_getphdr(elf, i, &phdr_mem);
+            if (phdr == nullptr) continue;  // skip headers that cannot be read instead of aborting the scan
+
+            if (phdr->p_type == PT_LOAD) {  // only loadable segments are recorded
+                segments.push_back(
+                        {.vaddr = phdr->p_vaddr, .offset = phdr->p_offset, .size = phdr->p_filesz});
+            }
+        }
+    }
+
+    elf_end(elf);  // release the libelf handle and the descriptor before touching the cache
+    close(fd);
+
+    if (!segments.empty()) {
+        d_elf_load_segments_cache[filename] = std::move(segments);  // keyed by file name; entries keep program-header order
+        return StatusCode::SUCCESS;
+    }
+    return StatusCode::ERROR;  // nothing is cached on failure, so a later call parses the file again
+}
+
 CorefileRemoteMemoryManager::StatusCode
 CorefileRemoteMemoryManager::getMemoryLocationFromElf(
         remote_addr_t addr,
@@ -418,12 +462,46 @@ CorefileRemoteMemoryManager::getMemoryLocationFromElf(
     auto shared_libs_it = std::find_if(d_shared_libs.cbegin(), d_shared_libs.cend(), [&](auto& map) {
         return map.start <= addr && addr <= map.end;
     });
+
     if (shared_libs_it == d_shared_libs.cend()) {
         return StatusCode::ERROR;
     }
+
     *filename = &shared_libs_it->filename;
-    *offset_in_file = addr - shared_libs_it->start;
-    return StatusCode::SUCCESS;
+
+    // Check if we have cached segments for this file
+    auto cache_it = d_elf_load_segments_cache.find(**filename);
+    if (cache_it == d_elf_load_segments_cache.end()) {
+        // Initialize segments if not in cache
+        if (initLoadSegments(**filename) != StatusCode::SUCCESS) {
+            return StatusCode::ERROR;
+        }
+        cache_it = d_elf_load_segments_cache.find(**filename);
+    }
+
+    if (cache_it->second.empty()) {
+        return StatusCode::ERROR;
+    }
+
+    // Get the load address of the elf file
+    remote_addr_t elf_load_addr = cache_it->second[0].vaddr;
+
+    // Now relocate the address to the elf file
+    remote_addr_t symbol_vaddr = addr - (shared_libs_it->start - elf_load_addr);
+
+    // Find the correct segment
+    for (const auto& segment : cache_it->second) {
+        if (symbol_vaddr >= segment.vaddr && symbol_vaddr < segment.vaddr + segment.size) {
+            *offset_in_file = (symbol_vaddr - segment.vaddr) + segment.offset;
+            return StatusCode::SUCCESS;
+        }
+    }
+
+    LOG(ERROR) << "Failed to find the correct segment for address " << std::hex << std::showbase << addr
+               << "(with vaddr offset " << symbol_vaddr << " ) "
+               << " in file " << **filename;
+
+    return StatusCode::ERROR;
 }
 
 bool

diff --git a/src/pystack/_pystack/mem.h b/src/pystack/_pystack/mem.h
@@ -207,6 +207,15 @@ class CorefileRemoteMemoryManager : public AbstractRemoteMemoryManager
         ERROR,
     };
 
+    struct ElfLoadSegment  // One PT_LOAD program header of an ELF file, as filled in by initLoadSegments()
+    {
+        GElf_Addr vaddr;  // copied from the program header's p_vaddr
+        GElf_Off offset;  // copied from p_offset; used as the base when computing an offset in the file
+        GElf_Xword size;  // copied from p_filesz; used as the segment's length when matching an address
+    };
+    // Cache for PT_LOAD segments
+    mutable std::unordered_map<std::string, std::vector<ElfLoadSegment>> d_elf_load_segments_cache;
+
     // Data members
     std::shared_ptr<CoreFileAnalyzer> d_analyzer;
     std::vector<VirtualMap> d_vmaps;
@@ -220,5 +229,6 @@ class CorefileRemoteMemoryManager : public AbstractRemoteMemoryManager
             remote_addr_t addr,
             const std::string** filename,
             off_t* offset_in_file) const;
+    StatusCode initLoadSegments(const std::string& filename) const;
 };
 }  // namespace pystack
diff --git a/src/pystack/_pystack/process.cpp b/src/pystack/_pystack/process.cpp
@@ -750,6 +750,17 @@ AbstractProcessManager::findPythonVersion() const
     }
     int major = (version >> 24) & 0xFF;
     int minor = (version >> 16) & 0xFF;
+
+    if (major == 0 && minor == 0) {
+        LOG(DEBUG) << "Failed to determine Python version from symbols: empty data copied";
+        return {-1, -1};
+    }
+
+    if (major != 2 && major != 3) {
+        LOG(DEBUG) << "Failed to determine Python version from symbols: invalid major version";
+        return {-1, -1};
+    }
+
     LOG(DEBUG) << "Python version determined from symbols: " << major << "." << minor;
     return {major, minor};
 }