import torch

# Replace torch.jit.script with an identity function before the libraries
# below are imported. NOTE(review): presumably this keeps any
# @torch.jit.script usage inside those libraries from compiling - confirm.
torch.jit.script = lambda f: f

import spaces
import gradio as gr
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AwqConfig
import os
import bitnet
from huggingface_hub import login
from bitnet import replace_linears_in_hf

# Hugging Face access token, read from the "key" environment variable.
key = os.environ.get("key")
if key:
    login(key)
else:
    # Without a token, login() would fall back to an interactive prompt,
    # which cannot be answered in a headless deployment; skip it instead.
    print("Environment variable 'key' is not set; skipping Hugging Face login.")
| |
| |
| |
# Hub repository the tokenizer and the model weights are loaded from.
model_id = "IEITYuan/Yuan2-M32-hf"

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
_quantization_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
nf4_config = BitsAndBytesConfig(**_quantization_kwargs)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# trust_remote_code=True permits running the modeling code shipped in the
# model repository; device_map="auto" delegates device placement to the
# loader.
_model_load_kwargs = {
    "quantization_config": nf4_config,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **_model_load_kwargs)

# Switch to inference mode (disables dropout and similar training behavior).
model.eval()
@spaces.GPU
def generate_response(user_input, max_new_tokens, temperature):
    """Generate a single-turn reply to ``user_input`` with the loaded model.

    The message is wrapped as one "user" turn, rendered through the
    tokenizer's chat template, and sampled from with ``model.generate``.

    NOTE(review): ``spaces.GPU`` presumably requests a GPU for the duration
    of the call on Hugging Face Spaces - confirm against the ``spaces`` docs.

    Args:
        user_input: Text of the user's message.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Coerced to ``int`` because UI sliders may deliver floats.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded completion only (prompt tokens excluded), with special
        tokens removed and leading whitespace stripped.
    """
    # Diagnostic dump of GPU state before tokenization.
    os.system("nvidia-smi")

    messages = [{"role": "user", "content": user_input}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    input_ids = input_ids.to(model.device)

    # Diagnostic dump of GPU state right before generation.
    os.system("nvidia-smi")

    gen_tokens = model.generate(
        input_ids=input_ids,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=temperature,
    )

    # The generated sequence starts with the prompt tokens. Slice them off by
    # length instead of string-matching the decoded text against user_input:
    # the chat template adds role markers, so the decoded text generally does
    # not start with the raw message and the prompt would leak into the reply.
    prompt_length = input_ids.shape[-1]
    completion_tokens = gen_tokens[0][prompt_length:]

    gen_text = tokenizer.decode(completion_tokens, skip_special_tokens=True)
    return gen_text.lstrip()
|
|
|
|
| |
# Canned prompts offered in the UI, each with the generation settings that
# should be loaded alongside the message.
examples = [
    {"message": "What is the weather like today?", "max_new_tokens": 250, "temperature": 0.5},
    {"message": "Tell me a joke.", "max_new_tokens": 650, "temperature": 0.7},
    {"message": "Explain the concept of machine learning.", "max_new_tokens": 980, "temperature": 0.4},
]

# Dropdown labels "Example 1", "Example 2", ... in the same order as `examples`.
example_choices = [f"Example {position}" for position in range(1, len(examples) + 1)]


def load_example(choice):
    """Return the message and generation settings for a dropdown choice.

    Args:
        choice: One of the labels in ``example_choices``. ``None`` (nothing
            selected in the dropdown) or any unknown label is rejected.

    Returns:
        A ``(message, max_new_tokens, temperature)`` tuple taken from the
        matching entry of ``examples``.

    Raises:
        gr.Error: If ``choice`` is not a known example label, so the UI shows
            a readable message instead of an opaque ``ValueError``.
    """
    if choice not in example_choices:
        raise gr.Error("Please select an example from the dropdown first.")

    selected = examples[example_choices.index(choice)]
    return selected["message"], selected["max_new_tokens"], selected["temperature"]
|
|
|
|
# Build the web UI: generation controls, a prompt box, an output box, and an
# example loader that fills in the prompt and the controls.
with gr.Blocks() as demo:
    # Sampling controls share one row.
    with gr.Row():
        max_new_tokens_slider = gr.Slider(
            minimum=100,
            maximum=4000,
            value=980,
            label="Max New Tokens",
        )
        temperature_slider = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=0.3,
            label="Temperature",
        )

    message_box = gr.Textbox(lines=2, label="Your Message")
    generate_button = gr.Button("Try🫡Command-R")
    output_box = gr.Textbox(label="🫡Command-R")

    # Clicking the button runs generation with the current prompt and settings.
    generate_button.click(
        generate_response,
        inputs=[message_box, max_new_tokens_slider, temperature_slider],
        outputs=output_box,
    )

    example_dropdown = gr.Dropdown(label="🫡Load Example", choices=example_choices)
    example_button = gr.Button("🫡Load")

    # Loading an example overwrites the prompt box and both sliders.
    example_button.click(
        load_example,
        inputs=example_dropdown,
        outputs=[message_box, max_new_tokens_slider, temperature_slider],
    )

demo.launch()
|
|