获取汇编指令的大小

tif*_*ifa 7 x86 assembly machine-code opcode disassembly

我需要从内存中的一个小代码段逐个读取指令,我必须找出内存中指令的大小.

以下只是解释我的问题的原始反汇编代码的示例:

 (gdb) disas /r 0x400281,+8
 Dump of assembler code from 0x400281 to 0x400289:
    0x0000000000400281:  48 89 c7       movq   %rax, %rdi
    0x0000000000400284:  b0 00          movb   $0, %al
    0x0000000000400286:  e8 f2 48 00 00 callq  0x10001f30a
 End of assembler dump.
Run Code Online (Sandbox Code Playgroud)

我知道第一条指令的存储器地址(在这种情况下p = 0x0000000000400281),我可以从p读取每个存储器地址.问题是我不知道*(p + offset)的值是否是操作码,我知道每个操作码的大小信息都不固定.

那么,我可以得到每个汇编指令的大小吗?或者我可以知道我读的值是操作码还是信息?

Ira*_*ter 12

@ AlexisWilke的回答是对的:这很麻烦.他也为工作提供了正确的见解和参考.

我用C完成了这项工作.代码如下; 这用于生产环境.

注意事项:它覆盖了传统x86指令集中相当大的一部分,但不是全部,特别是不包含任何涉及向量寄存器集的指令.它还包含对我们代码中使用的一些"虚拟"指令的解码.我认为把它扩展到x86-64并不困难,但代码会变得更加混乱.最后,这段代码是原样直接摘出来的,我不保证它能开箱即用地编译.

/* (C) Copyright 2012-2014 Semantic Designs, Inc.
   You may freely use this code provided you retain this copyright message
*/

typedef unsigned int natural;

/* InstructionLength: return the length in bytes of the instruction at pc.

   pc      address of the first byte of the instruction (prefixes included).
   returns the total byte count (prefixes + opcode + ModR/M + SIB +
           displacement + immediate), or 0 if the instruction is not
           recognized; in that case a diagnostic is printed with printf.

   Only part of the one-byte and 0x0F-escaped opcode space is decoded;
   anything else is reported as an error.

   Known limitations:
     - Immediates and displacements are always sized for 32 bit operands and
       32 bit addressing: the 0x66/0x67 prefixes are counted as bytes but do
       not change the size computed for what follows them.
     - 0xF3 is decoded as an opcode (REP/PAUSE forms only), not as a generic
       prefix.
     - The RET (0xC3), REP RET (0xF3 0xC3) and JMP rel32 (0xE9) cases look at
       the bytes *following* the instruction and fold trailing 0xCC bytes
       into the returned length, so bytes past the instruction proper are
       read.  NOTE(review): presumably these are the project-specific
       "virtual" instructions -- confirm with the code generator.
*/
natural InstructionLength(BYTE* pc)
{
   natural length=0;          // bytes accounted for so far
   natural opcode, opcode2;   // primary opcode byte; following opcode byte(s)
   natural modrm;
   natural sib;
   BYTE* p=pc;                // read cursor, always one past the last byte fetched

   for (;;)
   {  // scan across prefix bytes; each one contributes one byte
      opcode=*p++;
      switch (opcode)
      {  case 0x64: case 0x65: // FS: GS: prefixes
         case 0x36: // SS: prefix
         case 0x66: case 0x67: // operand size overrides
         case 0xF0: case 0xF2: // LOCK, REPNE prefixes
         case 0x2E: // CS: prefix, used as HNT prefix on jumps
         case 0x3E: // DS: prefix, used as HT prefix on jumps
            length++;
            break;
         default:
            goto process_instruction_body;
      }
   }

process_instruction_body:
   switch (opcode) // switch on main opcode
   {
      // ONE BYTE OPCODE
      case 0x27: case 0x2F:
      case 0x37: case 0x3F:
      case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
      case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F:
      case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57:
      case 0x58: case 0x59: case 0x5A: case 0x5B: case 0x5C: case 0x5D: case 0x5E: case 0x5F:
      case 0x90: // nop
      case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97: // xchg
      case 0x98: case 0x99:
      case 0x9C: case 0x9D: case 0x9E: case 0x9F:
      case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xAA: case 0xAB: // string operators
      case 0xAC: case 0xAD: case 0xAE: case 0xAF:
      case 0xC9:
      case 0xCC: // int3
      case 0xF5: case 0xF8: case 0xF9: case 0xFC: case 0xFD:
         return length+1; // include opcode

      case 0xC3: // RET, with trailing 0xCC bytes folded into the length
         if (*p++ != 0xCC)
            return length+1; // plain RET
         if (*p++ != 0xCC)
            return length+2; // RET + one 0xCC
         if (*p++ == 0xCC
             && *p++ == 0xCC)
            return length+5; // RET + four 0xCC
         goto error;         // RET + two or three 0xCC is not an accepted pattern

      // TWO BYTE INSTRUCTION (opcode + 8 bit immediate)
      case 0x04: case 0x0C: case 0x14: case 0x1C: case 0x24: case 0x2C: case 0x34: case 0x3C:
      case 0x6A:
      case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7:
         return length+2;

      // TWO BYTE RELATIVE BRANCH
      case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77:
      case 0x78: case 0x79: case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E: case 0x7F:
      case 0xE0: case 0xE1: case 0xE2: case 0xE3: case 0xEB:
         return length+2;

      // THREE BYTE INSTRUCTION
      case 0xC2: // RET imm16: opcode + 16 bit stack adjustment
         return length+3;

      // FIVE BYTE INSTRUCTION (opcode + 32 bit immediate)
      case 0x05: case 0x0D: case 0x15: case 0x1D:
      case 0x25: case 0x2D: case 0x35: case 0x3D:
      case 0x68:
      case 0xA9:
      case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF:
         return length+5;

      // FIVE BYTE RELATIVE CALL
      case 0xE8:
         return length+5;

      // FIVE BYTE RELATIVE BRANCH
      case 0xE9:
         if (p[4]==0xCC)     // p[0..3] is the rel32; p[4] is the byte after the jump
            return length+6; // <jmp near ptr ...  int 3>
         return length+5;    // plain <jmp near ptr>

      // FIVE BYTE DIRECT ADDRESS
      case 0xA1: case 0xA2: case 0xA3: // MOV AL,AX,EAX moffset...
         return length+5;

      // ModR/M with no immediate operand
      case 0x00: case 0x01: case 0x02: case 0x03: case 0x08: case 0x09: case 0x0A: case 0x0B:
      case 0x10: case 0x11: case 0x12: case 0x13: case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x20: case 0x21: case 0x22: case 0x23: case 0x28: case 0x29: case 0x2A: case 0x2B:
      case 0x30: case 0x31: case 0x32: case 0x33: case 0x38: case 0x39: case 0x3A: case 0x3B:
      case 0x84: case 0x85: case 0x86: case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8D: case 0x8F:
      case 0xD1: case 0xD2: case 0xD3:
      case 0xFE: case 0xFF: // misinterprets JMP far and CALL far, not worth fixing
         length++; // count opcode
         goto modrm;

      // ModR/M with immediate 8 bit value
      case 0x80: case 0x82: case 0x83:
      case 0xC0: case 0xC1:
      case 0xC6:  // with r=0?
         length+=2; // count opcode and immediate byte
         goto modrm;

      // ModR/M with immediate 32 bit value
      case 0x81:
      case 0xC7:  // with r=0?
         length+=5; // count opcode and immediate dword
         goto modrm;

      case 0x9B: // FSTSW AX = 9B DF E0
         opcode2=*p++;
         if (opcode2!=0xDF)
         {  // report the byte that failed to match and where it lives
            printf("InstructionLength: Unimplemented 0x9B secondary opcode %2x at %p\n",opcode2,(void*)(p-1));
            goto error;
         }
         opcode2=*p++;
         if (opcode2==0xE0)
            return length+3;
         printf("InstructionLength: Unimplemented 0x9B tertiary opcode %2x at %p\n",opcode2,(void*)(p-1));
         goto error;

      case 0xD9: // various FP instructions
         modrm=*p++;
         length++; //  account for FP prefix
         switch (modrm)
         {  case 0xC9: case 0xD0:
            case 0xE0: case 0xE1: case 0xE4: case 0xE5:
            case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED: case 0xEE:
            case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC: case 0xFD: case 0xFE: case 0xFF:
               return length+1;
            default:  // r bits matter if not one of the above specific opcodes
               switch ((modrm&0x38)>>3)
               {  case 0: goto modrm_fetched;  // fld
                  case 1: return length+1;     // fxch
                  case 2: goto modrm_fetched;  // fst
                  case 3: goto modrm_fetched;  // fstp
                  case 4: goto modrm_fetched;  // fldenv
                  case 5: goto modrm_fetched;  // fldcw
                  case 6: goto modrm_fetched;  // fnstenv
                  case 7: goto modrm_fetched;  // fnstcw
               }
               goto error; // unrecognized 2nd byte
         }

      case 0xDB: // various FP instructions; only the DB E3 form is accepted
         modrm=*p++;
         length++; //  account for FP prefix
         if (modrm==0xE3)
            return length+1;
         goto error; // unrecognized 2nd byte

      case 0xDD: // various FP instructions
         modrm=*p++;
         length++; //  account for FP prefix
         switch (modrm)
         {  case 0xE1: case 0xE9:
               return length+1;
            default:  // r bits matter if not one of the above specific opcodes
               switch ((modrm&0x38)>>3)
               {  case 0: goto modrm_fetched;  // fld
                  // case 1 (fisttp) is deliberately unsupported and falls to error
                  case 2: goto modrm_fetched;  // fst
                  case 3: goto modrm_fetched;  // fstp
                  case 4: goto modrm_fetched;  // frstor m (memory operand) / fucom st(i)
                  case 5: return length+1;     // fucomp
                  case 6: goto modrm_fetched;  // fnsav
                  case 7: goto modrm_fetched;  // fnstsw
               }
               goto error; // unrecognized 2nd byte
         }

      case 0xF3: // funny prefix REPE
         opcode2=*p++;  // get second opcode byte
         switch (opcode2)
         {  case 0x90: // == PAUSE
            case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xAA: case 0xAB: // string operators
               return length+2;
            case 0xC3: // (REP) RET
               if (*p++ != 0xCC)
                  return length+2; // only (REP) RET
               if (*p++ != 0xCC)
                  goto error;
               if (*p++ == 0xCC)
                  return length+5; // (REP) RET CLONE IS LONG JUMP RELATIVE
               goto error;
            case 0x66: // operand size override (32->16 bits)
               if (*p++ == 0xA5) // "rep movsw"
                  return length+3;
               goto error;
            default:
               goto error;
         }

      case 0xF6: // group 3, byte operand: only TEST (/0, /1) has an immediate
         length++; // count opcode
         modrm=*p++;
         if ((modrm & 0x30) == 0)
            length++; // 8 bit immediate operand
         goto modrm_fetched;

      case 0xF7: // group 3, dword operand: only TEST (/0, /1) has an immediate
         length++; // count opcode
         modrm=*p++;
         if ((modrm & 0x30) == 0)
            length+=4; // 32 bit immediate operand
         goto modrm_fetched;

      // Intel's special prefix opcode
      case 0x0F:
         length+=2; // add one for special prefix, and one for following opcode
         opcode2=*p++;
         switch (opcode2)
         {  case 0x31: // RDTSC
               return length;

            // CMOVxx
            case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
            case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F:
               goto modrm;

            // JC relative 32 bits
            case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87:
            case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: case 0x8F:
               return length+4; // account for 32 bit displacement

            // SETxx rm32
            case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97:
            case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: case 0x9E: case 0x9F:
               goto modrm;

            case 0xA2: // CPUID: just the two opcode bytes already counted
               return length;

            case 0xAE: // LFENCE, SFENCE, MFENCE
               opcode2=*p++;
               switch (opcode2)
               {  case 0xE8: // LFENCE
                  case 0xF0: // MFENCE
                  case 0xF8: // SFENCE
                     return length+1;
                  default:
                     printf("InstructionLength: Unimplemented 0x0F, 0xAE tertiary opcode in clone  %2x at %p\n",opcode2,(void*)(p-1));
                     goto error;
               }

            case 0xAF: // imul
            case 0xB0: // cmpxchg 8 bits
               goto error;

            case 0xB1: // cmpxchg 32 bits
            case 0xB6: case 0xB7: // movzx
            case 0xBC: /* bsf */ case 0xBD: // bsr
            // 0xBE, 0xBF (movsx) are deliberately unsupported
            case 0xC1: // xadd
            case 0xC7: // cmpxchg8b
               goto modrm;

            default:
               // report the secondary opcode byte, not the 0x0F escape
               printf("InstructionLength: Unimplemented 0x0F secondary opcode in clone %2x at %p\n",opcode2,(void*)(p-1));
               goto error;
         } // switch

      // ALL THE THE REST OF THE INSTRUCTIONS; these are instructions that runtime system shouldn't ever use
      default:
      /* case 0x26: case 0x36: // ES: SS: prefixes
         case 0x9A:
         case 0xC8: case 0xCA: case 0xCB: case 0xCD: case 0xCE: case 0xCF:
         case 0xD6: case 0xD7:
         case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xEA: case 0xEB: case 0xEC: case 0xED: case 0xEF:
         case 0xF4: case 0xFA: case 0xFB:
       */
         printf("InstructionLength: Unexpected opcode %2x\n",opcode);
         goto error;
   }

modrm:
   modrm=*p++;
modrm_fetched:
   // On entry, length covers everything except the ModR/M byte and what it implies.
   if (trace_clone_checking)
      printf("InstructionLength: ModR/M byte %p %2x\n",(void*)pc,modrm);
   if (modrm >= 0xC0)
      return length+1;  // register operand: only the ModR/M byte itself
   else
   {  /* memory access */
      if ((modrm & 0x7) == 0x04)
      {  /* instruction with SIB byte */
         length++; // account for SIB byte
         sib=*p++; // fetch the sib byte
         if ((sib & 0x7) == 0x05)
         {  // base field 5: a displacement always follows the SIB byte
            if ((modrm & 0xC0) == 0x40)
               return length+1+1; // account for MOD + byte displacment
            else
               return length+1+4; // account for MOD + dword displacement
         }
      }
      switch (modrm & 0xC0)
      {  case 0x0:
            if ((modrm & 0x07) == 0x05)
               return length+5; // 4 byte displacement
            else
               return length+1; // zero length offset
         case 0x80:
            return length+5;  // 4 byte offset
         default:
            return length+2;  // one byte offset
      }
   }

error:
   printf("InstructionLength: unhandled opcode at %p with opcode %2x\n",(void*)pc,opcode);
   return 0; // can't actually execute this
}
Run Code Online (Sandbox Code Playgroud)

  • @CHRIS:我把它留给了有动力的读者. (2认同)

Ale*_*lke 5

解码指令并不那么复杂。然而,由于Intel系列处理器是CISC,这使得这项任务相当艰巨。

首先,您不应该用汇编程序编写它,因为这将花费您一两年的时间,但也许您有时间这样做。由于您只需要扫描代码,而不需要打印结果,因此您可以比实际反汇编程序更快地完成工作。话虽这么说,你会遇到同样的主要问题。

首先,手册在那里:

http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html?iid=tech_vt_tech+64-32_manuals

我建议这个:

http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf

然后,您所要做的就是读取一个字节并理解它。第 770 页有一个表格,其中显示了从操作码到指令的编码。

例如,0x33 表示以 Gb,Ev 作为参数的 XOR。G 表示以下 ModR/M 中定义的通用寄存器。那么b就是大小(字节)。E 表示在该一个字节之后有一个 ModR/M(G 和 E 的字节相同)。因此,您必须读取该一个字节来确定寻址模式,并从中确定寄存器(可以忽略)和地址大小。地址 (Ev) 可以是另一个寄存器(则没有额外字节),它可以是立即数据(1、2、4、8 字节),也可以是地址(同样是 1、2、4、8 字节)。很简单,对吧?请注意,所有指令都使用完全相同的 ModR/M,因此您只需实现一次。此外,在指令代码之后添加字节的顺序始终完全相同。

在地址或立即数(如果我是正确的)之前是 64 位指令的额外 Mod。该定义定义了附加模式和对扩展寄存器的支持。所有这些都在我之前提到的文档中进行了详细描述。

或多或少,您需要一个能理解 ModR/M、SIB 和前缀的解析器,然后 voilà,就完成了。事情没那么复杂。然后第一个字节告诉你指令(如果第一个字节是0x0F,则为前2个字节...)

一些指令还支持前缀来调整操作数的大小和其他类似的东西。据我所知,只有0x66(op大小)和0x67(addr大小)对地址和立即数据的大小有影响。其他前缀不会影响指令使用的字节数,因此您可以简单地忽略它们(我们可以计算它们,但不需要知道接下来的内容)。

综上所述,使用 LLVM 库(正如评论中有人提到的那样)可能是一个更好/更简单的选择,尽管如果您的东西有限,它可能比您需要的要大得多。