---
license: mit
language:
- en
library_name: transformers
tags:
- vision
- vision-encoder-decoder-model
pipeline_tag: image-to-text
---

# Add Vision Head

```python
| |
| Mistral |
| VISION-ENCODER-DECODER-MODEL |
| |
| print('Add Vision...') |
| # ADD HEAD |
| # Combine pre-trained encoder and pre-trained decoder to form a Seq2Seq model |
| |
| |
| |
| Vmodel = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( |
| "google/vit-base-patch16-224-in21k", "LeroyDyer/Mixtral_AI_Tiny" |
| ) |
| _Encoder_ImageProcessor = Vmodel.encoder |
| _Decoder_ImageTokenizer = Vmodel.decoder |
| _VisionEncoderDecoderModel = Vmodel |
| # Add Pad tokems |
| LM_MODEL.VisionEncoderDecoder = _VisionEncoderDecoderModel |
| # Add Sub Components |
| LM_MODEL.Encoder_ImageProcessor = _Encoder_ImageProcessor |
| LM_MODEL.Decoder_ImageTokenizer = _Decoder_ImageTokenizer |
| LM_MODEL |
| |
| |
| |
```