U
    g$:e                     @   s*   d dl Z d dlZd dlZG dd dZdS )    Nc                   @   s0   e Zd ZdddZdddZdd	 Zd
d ZdS )HumanEvalDataset   pythonFc                 C   s   || _ ttj| j d| d | _| | j|}g | _t	t
|D ]"}t	|D ]}| j||  qTqH| jd d | _tjd td| dt
| j  dS )	z
        root: the path to the HumanEval dataset
        sample_num: the number of samples for each prompt
        language: the language of the HumanEval dataset
        issft: whether to use the SFT setting
        z
humaneval-z.jsonlr   	stopwordsi  zRead HumanEval from z, number of samples N)rootopenospathjoin	readlinesdataget_qa_only_data
clean_datarangelenappendr   nprandomseedprint)selfr   
sample_numlanguageissfttmpij r   ^/weka-jd/prod/containers/zhuqihao/dev-cpu/deepseek-coder/Evaluation/HumanEval/utils/dataset.py__init__   s    "zHumanEvalDataset.__init__c                 C   s   g }|D ]r}t |}|d  }d|kr4|d }n|d }|rLd| d}d|kr^|d }ng }|||d ||d q|S )z
        data_json: the jsonl file of HumanEval
        sft: whether to use the SFT setting
        return: a list of dict, each dict contains the prompt, task_id and stopwords
        promptprefixzBelow is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
Write a program to perform the given task.

Input:
z

### Response:
Zstop_tokenstask_id)r    r"   original_promptr   )jsonloadsstripr   )r   Z	data_jsonsftansliner    Zorigin_promptsr   r   r   r      s    


z!HumanEvalDataset.get_qa_only_datac                 C   s
   t | jS )z=
        return the number of samples in the dataset
        )r   r   )r   r   r   r   __len__2   s    zHumanEvalDataset.__len__c                 C   s   | j | }|S )z,
        return the sample at index
        )r   )r   indexsampler   r   r   __getitem__8   s    
zHumanEvalDataset.__getitem__N)r   r   F)F)__name__
__module____qualname__r   r   r+   r.   r   r   r   r   r      s   

r   )r   numpyr   r$   r   r   r   r   r   <module>   s   