U
    J>Be'                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	T d dl
mZ d dlmZ d dlmZmZ ddlmZ ddlmZ d	d
dddddddddddddgddddddddd d!g
d"d#d$d%d&d'd(d)d*d+g
d,d-d.d/d0d1d.gd2Zd3d4d5d6d7d8ZdPeeed:d;d<Zeeee ejf eee ejf eejd=d>d?ZdQdBdCZee e dDdEdFZ!ddGdHdIdJdddKdLgd@d@d@dAfeeee"eeee e#e#e#edMdNdOZ$dS )R    N)*)tqdm)defaultdict)ThreadPoolExecutoras_completed   )stream_jsonl)check_correctnesszimport mathz	import rez
import syszimport copyzimport datetimezimport itertoolszimport collectionszimport heapqzimport functoolszimport hashlibzimport numpyzimport numpy as npzimport stringzfrom typing import *zfrom collections import *mathstringsfmtZstrconvtimebytesregexpsortz	math/randz
crypto/md5z#include<stdlib.h>z#include<algorithm>z#include<math.h>z#include<stdio.h>z#include<vector>z#include<string>z#include<climits>z#include<cstring>z#include<iostream>z#include<cassert>zusing System.Numerics;zusing System.Diagnostics;z!using System.Collections.Generic;zusing System.Linq;zusing System.Text;z#using System.Security.Cryptography;)pythongocppcsCPPGoJava
JavaScriptPython)r   r   javajsr   	humaneval)	data_filedataset_typereturnc                 C   sx   |dk	rt | d d| krh| dkrTtjtjt}tj|ddddd} d	d
 t| D }nd| d|S )z<
    Reads a dataset and returns a dictionary of tasks.
    Nz-shot setting...r   z..zhumaneval-xr   datazhumaneval_python.jsonl.gzc                 S   s   i | ]}|d  |qS )task_id ).0taskr"   r"   m/weka-jd/prod/containers/zhuqihao/dev-cpu/upload_code/deepseek-coder/Evaluation/MBPP/human_eval/evaluation.py
<dictcomp>U   s      z read_dataset.<locals>.<dictcomp>z	Dataset: z not supported.)	printlowerospathdirnameabspath__file__joinr   )r   r   Znum_shotcurrent_pathdatasetr"   r"   r%   read_datasetG   s    r1   )num_samplesnum_correctkr   c                    sl   t t t tddd t| t r0t| t|}nt| t|ksDtt| }t	 fddt
||D S )zH
    Estimates pass@k of each problem and returns them in an array.
    )ncr4   r   c              
   S   s:   | | |k rdS dt d|t | | d | d    S )z=
        Calculates 1 - comb(n - c, k) / comb(n, k).
        g      ?r   )npprodarange)r5   r6   r4   r"   r"   r%   	estimatord   s    z%estimate_pass_at_k.<locals>.estimatorc                    s$   g | ]\}} t |t |qS r"   )int)r#   r5   r6   r:   r4   r"   r%   
<listcomp>r   s     z&estimate_pass_at_k.<locals>.<listcomp>)r;   float
isinstance	itertoolsrepeatlenAssertionErroriterr7   arrayzip)r2   r3   r4   Znum_samples_itr"   r<   r%   estimate_pass_at_k[   s    	
rG   Fr   c                 C   s  | d }|r*| d d d || d  S |rXd|| krX|| d dkrX|| d }n|| d }d |}| d }|dkrd td d }|| d | d }	n&|dkrd}
td D ]}||kr|
|d 7 }
q|
d | d | }	n|d	kr|d | }	n|d
krJd}
td
 D ]}|
|d 7 }
q|
d | d | }	n|dkrd|d | }	nj|dkrj|| d }||d}|rd|| kr|| d }n|| d }|| d }g }td D ]>}||kr|dd }|d |kr|d| d q|rPdd dd |D  d }|d | d | | d | }	n|d | | d | }	nd|dkrd}|| d }|| | | | }	n4|dkr|dd dkrd | }|d | d! }	|	S )"z,
    Processes a sample for evaluation.
    r!   
generation
testexample_test r   r   r   r   )r   
javascripttsshr   Zgo232import
test_setupr   /."z	import (
z    c                 S   s   g | ]}|d  qS )rI   r"   )r#   pr"   r"   r%   r=      s     z*process_humaneval_test.<locals>.<listcomp>)rustz
fn main(){ 
 } 
declarationphpN   z<?phpz<?php
z?>)r.   IMPORT_HELPERreplacesplitappend)sampleproblemsrK   is_mbpplanguager!   rJ   coderQ   test_stringZtest_set_upspromptimport_stringZ
other_pkgspkgrV   Zimport_other_pkgsmainrY   r"   r"   r%   process_humaneval_testt   sh     





"

rk   )filenamer   c                 C   sd   g }|  dr"tt| dd}n
t| d}|D ]&}tdd |D r0|t| q0|  |S )z
    Streams a JSONL file.
    z.gzrbrtrc                 s   s   | ]}|   V  qd S )N)isspace)r#   xr"   r"   r%   	<genexpr>   s     z#stream_jsonl_all.<locals>.<genexpr>)endswithgzipopenanyr_   jsonloadsclose)rl   resultsfpliner"   r"   r%   stream_jsonl_all   s    

r}   z./    g      $@z!../data/humaneval_python.jsonl.gz
   d   )
input_filetmp_dir	n_workerstimeoutproblem_fileout_dirr4   test_groundtruthrK   rb   rc   c              	      s  |rt d t|dd}t| }t|d,}g }t }d}tt}|rt d t| D ]}|d }|	dd 
 }|d	krd
}tj||d}|d |d< t||||
|d< |d dkrq^||||||| f}|jtf| }|| ||  d7  < |d7 }q^nt d t|D ]}|d }|	s4|
}|	sH|d	krHd
}|	rRd}tj||d}||d< t||||	|
|d< |d dkrqd|kr|d }n|| }||||||f}|jtf| }|| ||  d7  < |d7 }qt|t|kr
d}nd}t d tt|t|dD ](}| }||d  |d |f q*W 5 Q R X g g   | D ]0}dd |D }t|  t| qptt  |r|} fdd|D }t | n t dt t dt  |S )z:
    Evaluates the functional correctness of a model.
    zExample test...r   )r   )max_workersr   zTesting ground truth...r!   rR   rM   r   
evaluationZcanonical_solutionrH   Z	test_codeNr   zReading samples...r   completion_idTFzRunning test suites...)totalc                 S   s   g | ]}|d  d qS )r   passedr"   )r#   ro   r"   r"   r%   r=     s     z3evaluate_functional_correctness.<locals>.<listcomp>c                    s0   i | ](}|k  rd | t | qS )zpass@)allrG   mean)r#   r4   correctr   r"   r%   r&   $  s     z3evaluate_functional_correctness.<locals>.<dictcomp>zTotal:zCorrect:)r'   r1   r}   r   Counterr   listr   valuesr^   r(   r)   r*   r.   rk   submitr	   r_   rB   r   resultsumr7   rE   )r   r   r   r   r   r   r4   r   rK   rb   rc   ra   Zsample_jsonlexecutorfuturesr   	n_samplesrz   r`   r!   langZtmp_dir_argsfutureZcompletion_id_Zevaluate_pass_at_kr   r   ksZ	pass_at_kr"   r   r%   evaluate_functional_correctness   s    



(



r   )Nr   N)FFr   )%r)   sysfirerw   rt   regexnumpyr7   r@   typing	tqdm.autor   collectionsr   concurrent.futuresr   r   r    r   	executionr	   r\   ZLANGUAGE_NAMEstrDictr1   Unionr;   ListndarrayrG   rk   Iterabler}   r>   boolr   r"   r"   r"   r%   <module>   s   /
   
B