"""debug_downloaded_data.py - Inspect the downloaded conversation format."""
|
|
|
| import json
|
|
|
def _summarize_value(value, max_len=100):
    """Return a short printable stand-in for a top-level record value.

    Long strings are cut to ``max_len`` characters plus ``...``; dicts and
    lists are replaced by a one-line description. Anything else is returned
    unchanged.
    """
    if isinstance(value, str) and len(value) > max_len:
        return value[:max_len] + "..."
    if isinstance(value, dict):
        return f"Dict with keys: {list(value.keys())}"
    if isinstance(value, list):
        return f"List with {len(value)} items"
    return value


def _explore_nested(key, nested):
    """Print a one-level-deep look into a nested dict or list stored under *key*."""
    print(f"\n Exploring {key}:")
    if isinstance(nested, dict):
        print(f" Keys: {list(nested.keys())}")
        # Only the first three entries are shown to keep the output short.
        for nkey, nvalue in list(nested.items())[:3]:
            if isinstance(nvalue, str) and len(nvalue) > 50:
                nvalue = nvalue[:50] + "..."
            print(f" {nkey}: {nvalue}")
    elif nested:
        # Non-empty list: describe its first element. The element itself
        # (not the list) must be inspected, otherwise the keys never print.
        first_item = nested[0]
        print(f" First item type: {type(first_item)}")
        if isinstance(first_item, dict):
            print(f" First item keys: {list(first_item.keys())}")


def inspect_downloaded_data(
    data_path="data/conversation_raw/OpenAssistant_oasst1_raw.jsonl",
    max_records=5,
):
    """Print a summary of the leading records of a JSONL file.

    For each of the first ``max_records`` lines, the record's top-level keys
    and a shortened view of every value are printed. The ``prompt``,
    ``conversation`` and ``messages`` keys, when they hold a dict or a list,
    are additionally explored one level deep.

    Args:
        data_path: Path of the JSONL file to read (UTF-8, one JSON document
            per line).
        max_records: Number of leading lines to read. Blank lines count
            toward this limit but produce no output.

    Returns:
        None. All results are written to standard output. I/O and JSON
        decoding errors are reported as an ``Error reading file`` line
        instead of being raised.
    """
    print("🔍 Inspecting downloaded OpenAssistant data...")
    print("=" * 50)

    try:
        with open(data_path, "r", encoding="utf-8") as f:
            for i, raw_line in enumerate(f):
                if i >= max_records:
                    break

                line = raw_line.strip()
                if not line:
                    continue

                record = json.loads(line)
                print(f"\nRecord {i+1}:")

                if not isinstance(record, dict):
                    # Valid JSON but not an object: there are no keys to
                    # list, so report the type and move on to the next line.
                    print(f"Top-level value is a {type(record).__name__}, not an object")
                    continue

                print(f"Top-level keys: {list(record.keys())}")

                for key, value in record.items():
                    print(f" {key}: {_summarize_value(value)}")

                for key in ("prompt", "conversation", "messages"):
                    nested = record.get(key)
                    if isinstance(nested, (dict, list)):
                        _explore_nested(key, nested)
    except (OSError, ValueError) as e:
        # OSError covers a missing or unreadable file; ValueError covers
        # malformed JSON (JSONDecodeError) and undecodable bytes.
        print(f"Error reading file: {e}")
|
|
|
if __name__ == "__main__":
    # Run the inspection only when executed as a script, not when imported.
    inspect_downloaded_data()
|
|
|