You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
from datasets import load_dataset, load_from_disk
# Reproduction: load the local Alpaca demo file with the packaged "json"
# builder, eagerly (non-streaming), requesting only the "train" split.
load_dataset(
    path="json",
    data_files=["./data/alpaca_en_demo.json"],
    split="train",
    streaming=False,
)
root@6d84d38746c2:/workspace/LLaMA-Factory# python test.py
Generating train split: 0 examples [00:00, ? examples/s]
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json/json.py", line 130, in _generate_tables
pa_table = paj.read_json(
File "pyarrow/_json.pyx", line 308, in pyarrow._json.read_json
File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: JSON parse error: Column() changed from object to array in row 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/datasets/builder.py", line 1997, in _prepare_split_single
for _, table in generator:
File "/usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json/json.py", line 153, in _generate_tables
df = pd.read_json(f, dtype_backend="pyarrow")
File "/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py", line 211, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py", line 331, in wrapper
return func(*args, **kwargs)
TypeError: read_json() got an unexpected keyword argument 'dtype_backend'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/workspace/LLaMA-Factory/test.py", line 3, in <module>
load_dataset(path='json',
File "/usr/local/lib/python3.10/dist-packages/datasets/load.py", line 2616, in load_dataset
builder_instance.download_and_prepare(
File "/usr/local/lib/python3.10/dist-packages/datasets/builder.py", line 1029, in download_and_prepare
self._download_and_prepare(
File "/usr/local/lib/python3.10/dist-packages/datasets/builder.py", line 1124, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/usr/local/lib/python3.10/dist-packages/datasets/builder.py", line 1884, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "/usr/local/lib/python3.10/dist-packages/datasets/builder.py", line 2040, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
Expected behavior
No response
Others
No response
The text was updated successfully, but these errors were encountered:
Reminder
System Info
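Dockerfile 使用的基础镜像 nvcr.io/nvidia/pytorch:24.02-py3 中，pandas 的版本是 1.5.3。
requirements.txt 中没有对 pandas 的版本做约束，因此最终安装的 pandas 没有升级到 2.x。
datasets 2.20.0 的 JSON 加载器会调用 pd.read_json(f, dtype_backend="pyarrow")，而 dtype_backend 参数从 pandas 2.0 起才支持，所以它实际依赖 2.x 的 pandas。
使用 1.x 的 pandas 会报错：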
File /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json/json.py:153, in Json._generate_tables(self, files)
150 with open(
151 file, encoding=self.config.encoding, errors=self.config.encoding_errors
152 ) as f:
--> 153 df = pd.read_json(f, dtype_backend="pyarrow")
154 except ValueError:
Reproduction
先按 Dockerfile 复现环境，
运行如下脚本报错
Expected behavior
No response
Others
No response
The text was updated successfully, but these errors were encountered: