o
    ijU                     @  s(  d Z ddlmZ ddlZddlZddlmZ ddlmZm	Z	m
Z
mZ ddlmZmZ ddlmZmZmZmZmZmZ dd	lmZmZ G d
d deZdZeeZede Zd~ddZdddZdddZi dej d fd!ej d"fd#ej dfd$ej!d fd%ej!d"fd&ej!dfd'ej"d fd(ej"d"fd)ej"dfd*ej#d fd+ej#d"fd,ej#dfd-ej$d fd.ej$d"fd/ej$dfd0ej%d fd1ej%d"fi d2ej%dfd3ej&d fd4ej&d"fd5ej&dfd6ej'd fd7ej'd"fd8ej'dfd9ej(d fd:ej(d"fd;ej(dfd<ej)d fd=ej)d"fd>ej)dfd?ej*d fd@ej*d"fdAej*dfdBej+d fi dCej+d"fdDej+dfdEej,d fdFej,d"fdGej,dfdHej-d fdIej-d"fdJej-dfdKej.d fdLej/d fdMej0dfdNej1dfdOej2dfdPej3dfdQej4dfdRej5dfdSej6dfej7dfej8dfej9dfej:dfej;dfej<dfej=dfej>dfej?dfej@dfdT
ZAddWdXZBddZd[ZCdd]d^ZDdd`daZEddfdgZFddhdiZGG djdk dkZHG dldm dmeHZIG dndo doeHZJg dpZKddrdsZLddwdxZMdd|d}ZNdS )ax  Parser for assembly programs.

Limitations:
1. This is only used to parse programs that we've already assembled and linked successfully,
so it's not intended to handle invalid programs gracefully.

2. This is only guaranteed to handle the subset of assembly we use in the book.
  I've included some support for other common assembly instructions but you shouldn't rely on it.
    )annotationsN)Path)	GeneratorListOptionalUnion   )asmtokenize)Expr	ImmediateOpcodeOperandOperatorRegister)TokenTokTypec                   @     e Zd ZdZdS )
ParseErrorzSWe encountered nvalid assembly (or, more likely, valid assembly that don't support)N__name__
__module____qualname____doc__ r   r   g/home/gtoal/gtoal.com/languages/algol60/new-unicode-parser/lang/c/tests4/test_framework/parser/parse.pyr          r   z[_A-Za-z][_A-Za-z0-9]*_lblstrreturnboolc                 C  s"   t }tjdkr	t}t|| duS )zWCould this symbol name be a function or variable name from the original source program?darwinN)C_IDENTIFIERsysplatformMANGLED_C_IDENTIFIERre	fullmatch)r   regexr   r   r   is_valid_c_identifier$   s   
r*   toksList[Token]expectedr   Nonec                 C  s2   |  d}|j|krtd| d| d|  dS )z6Consume next token and fail if it isn't what we expectr   z	Expected z but found z. Remaining tokens: N)poptok_typer   )r+   r-   next_tokr   r   r   expect_next-   s   

r2   toktuple[Opcode, Optional[int]]c                 C  s  |   std|  | dv rtjdfS | dv rtjdfS | dv r&tjdfS | dr0tjdfS | d	rMd}| d
 dkr@d}n| d
 dkrHd}tj|fS | drjd}| d
 dkr]d}n| d
 dkred}tj|fS | dst| drytj	dfS | drtj
dfS | drtjdfS | drtjdfS g d}dd |D }|| }| d dkr| dd |v rtjdfS tD ];}t|}| |r| t|d }d}|dkrd}n|dkrd}n|dkrd}|tjtjtjfv rd}||f  S qtjdfS )zParse an instruction mnemonic; return opcode and inferred size
    Inferred size is used to normalize immediate values later on;
    it's None if we can't infer the size or if this is a floating-point instruction
    zBad mnemonic: )cqocqto   )cdqcltd   )cltcltqcdqemovsdNmovsbr   lmovzcomiucomipxormulset)eggerB   lerA   beaaeppopesczc                 S  s   g | ]}d | qS )nr   ).0ccr   r   r   
<listcomp>   s    z parse_opcode.<locals>.<listcomp>r   jq)isalnumr   r   CDQCDQE
startswithMOVMOVSMOVZCMPXORIMULSETCCJMPCCr   lenPOPPUSHLEAUNKNOWN)r3   sizecondition_codesnegated_condition_codesall_condition_codesopcodeopsuffixr   r   r   parse_opcode6   sj   


















rt   raxr7   eaxr:   alrbxebxblrcxecxclrdxedxdlrdiedidilrsiesisilr8r8dr8br9r9dr9br10r10dr10br11r11dr11br12r12dr12br13r13dr13br14r14dr14br15r15dr15brsprbpripxmm0xmm1xmm2xmm3xmm4xmm5)
xmm6xmm7xmm8xmm9xmm10xmm11xmm12xmm13xmm14xmm15list[Token]tuple[Register, Optional[int]]c                 C  sB   t | tjd | dj}zt| W S  ty    td| w )zwParse register and infer its size in bytes

    <reg> ::= "%" <reg-alias>
    <reg-alias> ::= "rax" | "eax" | etc.
    r+   r-   r   z)expected register name after % but found )r2   r   PERCENTr/   tok_strREG_ALIASESKeyErrorr   )r+   reg_namer   r   r   parse_register   s   
r   r   c                 C  s   t | tjd | d}|j}|tjkrt|jdd}t|S |tj	tj
fv rQ| d}|jtjkr<td| |j t|jdd}|tj
krMt| S t|S td| )au  Parse an immediate value

    NOTE this won't correctly normalize signed/unsigned representations of same value
    e.g. -1 and 255 are the same in one-byte instructions but we won't parse them
    to the same Immediate operand here; we normalize them later in fix_immediate

    <immediate> ::= "$" [ "+" | "-" ] <int>
    <int> ::= ? decimal or hexadecimal integer ?
    r   r   basezbad immediate value: $zBad immediate value: $)r2   r   DOLLARr/   r0   INTintr   r   	PLUS_SIGN
MINUS_SIGNr   )r+   r1   r0   valnum_tokr   r   r   parse_immediate   s   





r   r   c                 C  s   g }	 |  d}|j}|tjkr||j n=|tjkr'|t|jdd n-|tjkr3|t	j
 n!|tjkr?|t	j n|tjkrK|t	j n	| d| 	 |S q)zParse an expression (used as displacement in memory operand)
    NOTE: we don't normalize these, so +10(%rbp) and 10(%rbp) will NOT compare equal

    <expr> ::= { <symbol> | <int> | "+" | "-" }+
    Tr   r   )r/   r0   r   SYMBOLappendr   r   r   r   r   PLUSr   MINUSATinsert)r+   exprr1   tok_typr   r   r   
parse_expr  s$   





r   tuple[Operand, Optional[int]]c                 C  s,  d}d}d}d}| d j tjkrt| }t| tjd | d j tjkr)t| \}}| d}|j tjkr=t	j
||ddfS |j tjkrKtdt|  | d j }|tjkr`t| djdd}n%t| \}}| d j tjkr| d | d j tjkrt| djdd}t| tjd t	
||||dfS )a  
    Parse memory operand

    <memory-operand> ::= [ <expr> ] "(" <guts> ")"
    <guts> ::= <reg> [ "," <idx-and-or-scale> ] // base, with or without other stuff
            | "," <idx-and-or-scale> // no base

    <idx-and-or-scale> ::= <int> | <reg> [ "," [ <int> ]]
    Nr   r   r   )dispr   z8Unexpected token after base register in memory operand: r   )r0   r   
OPEN_PARENr   r2   r   r   r/   CLOSE_PARENr	   MemoryCOMMAr   r   r   r   r   )r+   r   r   idxscaler   r1   next_tok_typer   r   r   parse_memory_operand'  s4   




r   rr   r   rm   Optional[int]c                 C  sR   t | tr'|du rtd| dk r| S | j|ddd}tj|ddd}t|S | S )	z3Normalize immediate values to signed representationNz:Can't interpret immediate b/c instruction size is ambigousr   littleF)length	byteordersignedT)r   r   )
isinstancer   r   to_bytesr   
from_bytes)rr   rm   as_bytesr   r   r   r   fix_immediate`  s   
r   c                 C  s   | d j }|tjkrt| S |tjkrt| dfS t| dkr.|tjkr.| dj	}|dfS |tj
kr<| d t| S t| dkrl|tjkrl| d j tjkrl| d j tjkrl| d j	 d| d j	 }|   |dfS t| S )zlParse the next operand in list of tokens
    <operand> ::= <reg> | <immediate> | <symbol>["@" <symbol>]
    r   Nr         @)r0   r   r   r   r   r   rh   r   r/   r   STARr   clearr   )r+   start_tok_typetargetr   r   r   parse_operandr  s&   





r   c                   @  r   )	DirectivezAny directiveNr   r   r   r   r   r     r   r   c                   @  r   )EnterTextSectionz5Directive that makes text section the current sectionNr   r   r   r   r   r     r   r   c                   @  r   )LeaveTextSectionzLDirective that makes any section other than text section the current sectionNr   r   r   r   r   r     r   r   )z.bssz.data.cstringz.rodataz.literal4.literal8z
.literal16r   tok_listc                 C  s   | d j dkr
t S | d j tv rt S | d j dv r%t| d j  d| d j dkr[| d jtjkr>td| d j  | d j }|dkrJt S |dkrX| d	 j d
krXt S t S t S )ab  Parse a directive and figure out whether it enters the text section, exits it, or neither

    NOTE: unlike earlier parse_* statements, we don't need to consume these tokens.
    tok_list represents a single line of assembly and once we've identified the kind of directive
    we can just discard the rest of the line

    <directive> ::= <text-section> | <non-text-section> | <other-section>
    <text-section> ::= ".text"
                     | ".section" "__TEXT" "," "__text"
                     | ".section" ".text"
    <non-text-section> ::= ".section" <non-text-section name> | ".bss" | ".data" | etc.
    <other-section> ::= <non-section directive> { <any-token> }+
    <non-text-section-name> ::= ? anything other than .text or __TEXT,__text ?
    <non-section-directve> ::= ? any symbol starting with "." other than ".section", ".text", ".bss", etc ?
    r   z.text)z.popsectionz.pushsectionz	.previousz not supportedz.sectionr   z5Expected section name after section directive, found __TEXTr   __text)	r   r   NON_TEXT_SECTIONSr   r   r0   r   r   r   )r   section_namer   r   r   parse_directive  s$   
r   tokensGenerator[Token, None, None]Union[asm.AsmItem, Directive]c           
        s^  t | }|jtjtjfv rt | }|jtjtjfv s|jtjkr(td|j ttjd}t | |}|jtj	kr?t
|jS |g}|jtjtjfvr^|| t | |}|jtjtjfvsK|d jdrjt|S |d}t|j\} g }|rt|\}}	|| |	r s|	 |rt|tjd | std|sz dur fdd	|D }t
||S )
a]  Parse the next instruction, label or directive

    Grammar:
    <statement> ::= <label> | <instruction> | <directive>
    <label> ::= <symbol> ":"
    <instruction> ::= <non-directive-symbol> [ <operand> {"," <operand> }] <statement-break>
    <statement-break> ::= "
" | ";"
    <non-directive-symbol> ::= ? any symbol not starting with "." ?
    z4Expected label, directive, or instruction but found 
r   .r   z$Expected another operand after commaNc                   s   g | ]}t | qS r   )r   )rW   rr   rm   r   r   rY   '  s    z#parse_statement.<locals>.<listcomp>)nextr0   r   	SEMICOLONNEWLINEr   r   r   r   COLONr	   Labelr   r_   r   r/   rt   r   r2   r   Instruction)
r   first_tokennl	cur_tokencur_line
opcode_tokrq   operandsnext_operand
maybe_sizer   r   r   parse_statement  sF   





r  filenamer   dict[str, asm.AssemblyFunction]c              	     s.  i  dddd fd
d}t | dddq}t|}d}d}	 zTt|}|rmt|tr<t|tr:|r8|| d}W q t|tjrTt|rT|rL|| tj	|g d}n |du rft|tj
retd| n|j| nt|trtd}W n	 ty~   Y nw q!W d   n1 sw   Y  |r||  S )zParse an assembly filerS   r   prefixr    c                 S  s2   t jdkr
| | | |r| t|d S | S )z,Backcompat-proof version of str.removeprefix)r   	   N)r$   version_inforemoveprefixr_   rh   )rS   r  r   r   r   remove_prefix0  s
   


z!parse_file.<locals>.remove_prefixfasm.AssemblyFunctionr.   c                   s*   t jdkr| jd}n| j}|  |< d S )Nr"   r   )r$   r%   name)r  keyasm_functionsr  r   r   add_fun8  s   
zparse_file.<locals>.add_funrzutf-8)encodingNTF)r  instructionsz'instruction found outside of function: )rS   r   r  r   r    r   )r  r  r    r.   )openr
   r  r   r   r   r	   r   r*   AssemblyFunctionr   r   r  r   r   StopIteration)r	  r  r  r   current_functionin_text_sectionasm_itemr   r  r   
parse_file+  sX   




/:r   )r   r   r    r!   )r+   r,   r-   r   r    r.   )r3   r   r    r4   )r+   r   r    r   )r+   r   r    r   )r+   r   r    r   )r+   r,   r    r   )rr   r   rm   r   r    r   )r   r,   r    r   )r   r   r    r   )r	  r   r    r
  )Or   
__future__r   r'   r$   pathlibr   typingr   r   r   r    r	   r
   r   r   r   r   r   r   r   r   RuntimeErrorr   C_IDENT_PATTERNcompiler#   r&   r*   r2   rt   AXBXCXDXDISIR8R9R10R11R12R13R14R15SPBPIPXMM0XMM1XMM2XMM3XMM4XMM5XMM6XMM7XMM8XMM9XMM10XMM11XMM12XMM13XMM14XMM15r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r   <module>   s  
 


	
	g








	























 
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
34
A



9
(

0G