B
    G%e'                 @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	T d dl
mZ d dlmZ d dlmZmZ ddlmZ ddlmZ d	d
dddddddddddddgddddddddd d!g
d"d#d$d%d&d'd(d)d*d+g
d,d-d.d/d0d1d.gd2Zd3d4d5d6d7d8ZdPeeed:d;d<Zeeee ejf eee ejf eejd=d>d?ZdQdBdCZee e dDdEdFZ!ddGdHdIdJdddKdLgd@d@d@dAfeeee"eeee e#e#e#edMdNdOZ$dS )R    N)*)tqdm)defaultdict)ThreadPoolExecutoras_completed   )stream_jsonl)check_correctnesszimport mathz	import rez
import syszimport copyzimport datetimezimport itertoolszimport collectionszimport heapqzimport functoolszimport hashlibzimport numpyzimport numpy as npzimport stringzfrom typing import *zfrom collections import *mathstringsfmtZstrconvtimebytesregexpsortz	math/randz
crypto/md5z#include<stdlib.h>z#include<algorithm>z#include<math.h>z#include<stdio.h>z#include<vector>z#include<string>z#include<climits>z#include<cstring>z#include<iostream>z#include<cassert>zusing System.Numerics;zusing System.Diagnostics;z!using System.Collections.Generic;zusing System.Linq;zusing System.Text;z#using System.Security.Cryptography;)pythongocppcsZCPPZGoJavaZ
JavaScriptPython)r   r   javajsr   	humaneval)	data_filedataset_typereturnc             C   sx   |d k	rt | d d| krh| d krTtjtjt}tj|ddddd} dd	 t| D }nd
| d|S )Nz-shot setting...r   z..zhumaneval-xr   datazhumaneval_python.jsonl.gzc             S   s   i | ]}||d  qS )task_id ).0taskr   r   `/weka-jd/prod/containers/zhuqihao/dev-cpu/hugginfaceeval/eval_benchmark/human_eval/evaluation.py
<dictcomp>R   s    z read_dataset.<locals>.<dictcomp>z	Dataset: z not supported.)	printlowerospathdirnameabspath__file__joinr   )r   r   Znum_shotZcurrent_pathdatasetr   r   r"   read_datasetG   s    r-   )num_samplesnum_correctkr   c                sl   t t t tddd t| t r0t| t|}nt| t|ksDtt| }t	 fddt
||D S )zH
    Estimates pass@k of each problem and returns them in an array.
    )ncr0   r   c          
   S   s:   | | |k rdS dt d|t | | d | d    S )z=
        Calculates 1 - comb(n - c, k) / comb(n, k).
        g      ?r   )npprodarange)r1   r2   r0   r   r   r"   	estimatora   s    z%estimate_pass_at_k.<locals>.estimatorc                s$   g | ]\}} t |t |qS r   )int)r    r1   r2   )r6   r0   r   r"   
<listcomp>o   s    z&estimate_pass_at_k.<locals>.<listcomp>)r7   float
isinstance	itertoolsrepeatlenAssertionErroriterr3   arrayzip)r.   r/   r0   Znum_samples_itr   )r6   r0   r"   estimate_pass_at_kX   s    	
rB   Fr   c             C   s  | d }|r*| d d d || d  S | d }|r`d|| kr`|| d dkr`|| d }n|| d }| d }|dkrd td d }	|	| d | d }
n2|d	krd}x$td	 D ]}||kr||d 7 }qW |d | d | }
n|d
kr
|d | }
n|dkrPd}xtd D ]}||d 7 }q"W |d | d | }
n|dkrj|d | }
nn|dkrt|| d }||d}|rd|| kr|| d }n|| d }|| d }	g }xJtd D ]>}||	kr|dd }|d |kr|d| d qW |rZdd dd |D  d }|	d | d | | d | }
n|	d | | d | }
nd|dkrd}|| d }|| | | | }
n4|dkr|d d dkrd | }|d | d! }
|
S )"Nr   
generation
testpromptexample_test r   r   r   r   )r   
javascripttsshr   Zgo232import
test_setupr   /."z	import (
z    c             S   s   g | ]}|d  qS )rD   r   )r    pr   r   r"   r8      s    z*process_humaneval_test.<locals>.<listcomp>)Zrustz
fn main(){ 
 } 
declarationZphp   z<?phpz<?php
z?>)r+   IMPORT_HELPERreplacesplitappend)sampleproblemsrG   is_mbpplanguager   rF   rE   coderM   Ztest_stringZtest_set_upsZimport_stringZ
other_pkgspkgrR   Zimport_other_pkgsmainrT   r   r   r"   process_humaneval_testq   sh     




"

rb   )filenamer   c             C   sh   g }|  dr"tt| dd}n
t| d}x.|D ]&}tdd |D r2|t| q2W |  |S )Nz.gzrbrtrc             s   s   | ]}|   V  qd S )N)isspace)r    xr   r   r"   	<genexpr>   s    z#stream_jsonl_all.<locals>.<genexpr>)endswithgzipopenanyrY   jsonloadsclose)rc   resultsfpliner   r   r"   stream_jsonl_all   s    


rt   z./    g      $@z!../data/humaneval_python.jsonl.gz
   d   )
input_filetmp_dir	n_workerstimeoutproblem_fileout_dirr0   test_groundtruthrG   r\   r]   c          	      s  |rt d t|dd}t| }t|d8}g }t }d}tt}|rt d xt| D ]}|d }|	dd 
 }|d	krd
}tj||d}|d |d< t||||
|d< |d d krq`||||||| f}|jtf| }|| ||  d7  < |d7 }q`W nt d xt|D ]}|d }|	s:|
}|	sN|d	krNd
}|	rXd}tj||d}||d< t||||	|
|d< |d d krq$d|kr|d }n|| }||||||f}|jtf| }|| ||  d7  < |d7 }q$W t|t|krd}nd}t d x@tt|t|dD ](}| }||d  |d |f q4W W d Q R X g g   x<| D ]0}dd |D }t|  t| q~W tt  |r|} fdd|D }t | n t dt t dt  |S )NzExample test...r   )r   )max_workersr   zTesting ground truth...r   rN   rI   r   Z
evaluationZcanonical_solutionrC   Z	test_coder   zReading samples...r   completion_idTFzRunning test suites...)totalc             S   s   g | ]}|d  d qS )r   passedr   )r    rf   r   r   r"   r8     s    z3evaluate_functional_correctness.<locals>.<listcomp>c                s0   i | ](}|k  rt | d | qS )zpass@)allrB   mean)r    r0   )correctr   r   r"   r#     s   z3evaluate_functional_correctness.<locals>.<dictcomp>zTotal:zCorrect:)r$   r-   rt   r   Counterr   listr   valuesrX   r%   r&   r'   r+   rb   submitr	   rY   r=   r   resultsumr3   r@   )rx   ry   rz   r{   r|   r}   r0   r~   rG   r\   r]   r[   Zsample_jsonlexecutorfuturesr   Z	n_samplesrq   rZ   r   langZtmp_dir_argsfutureZcompletion_id_Zevaluate_pass_at_kr   r   ksZ	pass_at_kr   )r   r   r"   evaluate_functional_correctness   s    



*



r   )Nr   N)FFr   )%r&   sysZfirern   rk   regexnumpyr3   r;   typing	tqdm.autor   collectionsr   concurrent.futuresr   r   r   r   Z	executionr	   rV   ZLANGUAGE_NAMEstrDictr-   Unionr7   ListndarrayrB   rb   Iterablert   r9   boolr   r   r   r   r"   <module>   s     
E