U
    dDe                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ G dd dZdS )	    N)AttrDict)evaluate_functional_correctness)AutoTokenizer)HumanEvalDataset)cleanup_codec                   @   s:   e Zd ZdZdddZe dd Zdd Zdd Z	dS )	HumanEvalz%
    HumanEval evaluation class.
       python      Nr   Fffffff? T(      c              
   C   s   || _ || _|| _|| _|| _|| _|| _|| _|| _|| _	|	| _
|d dd| _|| _tj| jdd |d}ztj|ddd| _W n2 tk
r } zt| dstW 5 d }~X Y nX d S )	N
model_path/_T)exist_okcls)trust_remote_codeF)	data_rootmax_seq_lenmax_gen_len
batch_sizekn_samplelanguagelog_dirsfttemperaturetop_preplace
model_nameinference_incrementosmakedirspopr   from_pretrained	tokenizer	ExceptionprintAssertionError)selfr   r   r   r   r   r   r   issftr    r"   r#   tokenizer_cfgr   Zk_sampleZtokenizer_clse r0   \/weka-jd/prod/deepseek/permanent/yangdejian/DeepSeek-Coder/Evaluation/HumanEval/humaneval.py__init__   s(    
zHumanEval.__init__c                     s   j dk	stdt j j j jd}t| j }|j}|j	} j
dkr` jdks`td|  tt||}|| } fdd|D }	t|	}
d	}tj j  j d
| d j d j d}t|d}t }td	t|	 jD ]}g }g }g }g }g }|	|| j  D ]d}|| }|d  }||  j|}||d  |t| || ||d  qt||j} jd	kr|j| j d jj! j j" jj!d}n|j| j d jj! jj!d}t#|D ]\}}|| } jj$|dd}||| d }t%| jd j|j&} jsL|| d | }|| ||| |d}|'t()|d  |*  |d7 }q +|||
| j q|,  |-   .| |-  dS )z2
        Evaluate the model on HumanEval.
        Nz4log_dir should not be None when evaluating humaneval)Z
sample_numr   r-   r   d   z(HumanEval PASS@100 needs n_sample >= 100c                    s*   g | ]"}t  jD ]}| j | qqS r0   )ranger   ).0xjr,   r0   r1   
<listcomp>@   s       z(HumanEval.eval_model.<locals>.<listcomp>r   _rank_bs
_shot_log_.jsonwpromptZoriginal_prompttask_idT)	input_idsmax_new_tokens	do_sampleeos_token_idr   r    pad_token_idF)rA   rB   rC   rD   rE   )skip_special_tokens	humaneval
)r@   
generationr?   Z	wholecode)/r   r+   r   r   r   r   r   lenprocess_indexnum_processesr   evalnparray_splitr4   r$   pathjoinr"   r   opentimestripappendr(   encodetorchtensortodevicer   generater   rD   r    	enumeratedecoder   Z	stopwordswritejsondumpsflush	log_scoreclosewait_for_everyone_calculate_final_score) r,   gptacceleratordatasetZnpromptdp_rankdp_sizeZprompt_indices_splitZprompt_indicesindicesall_numprocessed_numZlog_filetmpfile
start_timeidxZprompt_listZprompt_lensZorriginal_prompt_listZtokenized_prompt_lensZtaskidr7   dataZfprompttmprA   decoded	local_idxtext
predictionZsuffixpredictionresr0   r8   r1   
eval_model0   s    

 




zHumanEval.eval_modelc                 C   s   t j d }t | | | }td| dt| d| d|dd|| | d | d	 dd
d|dd| dd ||krtdt | d	 dd
dd dS )z 
        Log the score.
        i   @zDP RANK:z process_num/all_num:r   z avg_time_per_batch:z.2fz s still_need:r   <   z mzmem:z.3fz GiB bs:T)ra   zEVAL DONE! Process time N)rW   cudamax_memory_allocatedrS   r*   int)r,   ri   rm   rl   ro   bsmemZavg_timer0   r0   r1   rb      s    >zHumanEval.log_scorec           	      C   s   |j rtj| jd| j d}t|d}t|jD ]V}tj| j| j d| d| j	 d| j
 d}|t|  d  t| q4|  d	}| j
}t|tj| jd
| j
 d| j||d}td|d| j   t| dS )z,
        Calculate the final score.
        Zfinal_z.jsonlr>   r:   r;   r<   r=   rH   
   z
humaneval-)
input_fileZproblem_filetmp_dirtimeoutr   zscore iszpass@%dN)is_local_main_processr$   rP   rQ   r   r"   rR   r4   rL   r   r   r^   readrT   removerc   r   r   r*   r   )	r,   rg   ZlogfilepathlogfileiZ
tmplogfiler   Zrunlangrw   r0   r0   r1   re      s    
.*
z HumanEval._calculate_final_score)r   r	   r
   r   Nr   Fr   r   TNr   r   )
__name__
__module____qualname____doc__r2   rW   no_gradrx   rb   re   r0   r0   r0   r1   r      s&                        

Pr   )rS   stringmultiprocessingr$   numpyrN   r_   rerW   datetime
subprocesstorch.distributeddistributeddistZattrdictr   Zhuman_eval.evaluationr   transformersr   Zutils.datasetr   Zutils.utilsr   r   r0   r0   r0   r1   <module>   s    