Open
Description
Reminder
- I have read the above rules and searched the existing issues.
System Info
bin E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll
- llamafactory version: 0.9.2.dev0
- Platform: Windows-11-10.0.22631-SP0
- Python version: 3.12.9
- PyTorch version: 2.3.1+cu121 (GPU)
- Transformers version: 4.48.3
- Datasets version: 3.2.0
- Accelerate version: 1.2.1
- PEFT version: 0.12.0
- TRL version: 0.9.6
- GPU type: NVIDIA RTX A6000
- GPU number: 1
- GPU memory: 47.99GB
Reproduction
[INFO|2025-02-20 17:38:23] llamafactory.data.loader:157 >> Loading dataset identity.json...
[INFO|2025-02-20 17:38:23] llamafactory.data.loader:157 >> Loading dataset AI-ModelScope/adgen...
2025-02-20 17:38:23,388 - modelscope - WARNING - Use trust_remote_code=True. Will invoke codes from adgen. Please make sure that you can trust the external codes.
2025-02-20 17:38:24,076 - modelscope - INFO - No subset_name specified, defaulting to the default
2025-02-20 17:38:24,827 - modelscope - WARNING - Reusing dataset dataset_builder (C:\Users\Administrator\.cache\modelscope\hub\datasets\AI-ModelScope\adgen\master\data_files)
2025-02-20 17:38:24,827 - modelscope - INFO - Generating dataset dataset_builder (C:\Users\Administrator\.cache\modelscope\hub\datasets\AI-ModelScope\adgen\master\data_files)
2025-02-20 17:38:24,828 - modelscope - INFO - Reusing cached meta-data file: C:\Users\Administrator\.cache\modelscope\hub\datasets\AI-ModelScope\adgen\master\data_files\40205ab100713fe38606b45314e8fd2a
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "E:\soft\Anaconda\envs\llama_factory\Scripts\llamafactory-cli.exe\__main__.py", line 7, in <module>
File "E:\codesource\me\python\LLaMA-Factory\src\llamafactory\cli.py", line 112, in main
run_exp()
File "E:\codesource\me\python\LLaMA-Factory\src\llamafactory\train\tuner.py", line 93, in run_exp
_training_function(config={"args": args, "callbacks": callbacks})
File "E:\codesource\me\python\LLaMA-Factory\src\llamafactory\train\tuner.py", line 67, in _training_function
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "E:\codesource\me\python\LLaMA-Factory\src\llamafactory\train\sft\workflow.py", line 51, in run_sft
dataset_module = get_dataset(template, model_args, data_args, training_args, stage="sft", **tokenizer_module)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\codesource\me\python\LLaMA-Factory\src\llamafactory\data\loader.py", line 320, in get_dataset
eval_dataset = _get_merged_dataset(
^^^^^^^^^^^^^^^^^^^^
File "E:\codesource\me\python\LLaMA-Factory\src\llamafactory\data\loader.py", line 180, in _get_merged_dataset
datasets[dataset_name] = _load_single_dataset(dataset_attr, model_args, data_args, training_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\codesource\me\python\LLaMA-Factory\src\llamafactory\data\loader.py", line 99, in _load_single_dataset
dataset = MsDataset.load(
^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\modelscope\msdatasets\ms_dataset.py", line 326, in load
dataset_inst = remote_dataloader_manager.load_dataset(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\modelscope\msdatasets\data_loader\data_loader_manager.py", line 143, in load_dataset
oss_downloader.process()
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\modelscope\msdatasets\data_loader\data_loader.py", line 83, in process
self._prepare_and_download()
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\modelscope\msdatasets\data_loader\data_loader.py", line 153, in _prepare_and_download
self.dataset = self.data_files_manager.fetch_data_files(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\modelscope\msdatasets\data_files\data_files_manager.py", line 116, in fetch_data_files
return builder.as_dataset()
^^^^^^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\modelscope\msdatasets\download\dataset_builder.py", line 250, in as_dataset
k: self._convert_csv_to_dataset(k, v)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\modelscope\msdatasets\download\dataset_builder.py", line 218, in _convert_csv_to_dataset
df = pd.read_csv(
^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\datasets\streaming.py", line 75, in wrapper
return function(*args, download_config=download_config, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\datasets\utils\file_utils.py", line 1212, in xpandas_read_csv
return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\pandas\io\parsers\readers.py", line 1026, in read_csv
return _read(filepath_or_buffer, kwds)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\pandas\io\parsers\readers.py", line 620, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\pandas\io\parsers\readers.py", line 1620, in __init__
self._engine = self._make_engine(f, self.engine)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "E:\soft\Anaconda\envs\llama_factory\Lib\site-packages\pandas\io\parsers\readers.py", line 1898, in _make_engine
return mapping[engine](f, **self.options)
File "parsers.pyx", line 574, in pandas._libs.parsers.TextReader.__cinit__
File "parsers.pyx", line 663, in pandas._libs.parsers.TextReader._get_header
File "parsers.pyx", line 874, in pandas._libs.parsers.TextReader._tokenize_rows
File "parsers.pyx", line 891, in pandas._libs.parsers.TextReader._check_tokenize_status
File "parsers.pyx", line 2053, in pandas._libs.parsers.raise_parser_error
File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 17: invalid start byte
Others
No response
Activity