package backend

import (
	"context"
	"fmt"
	"os"
	"regexp"
	"strings"
	"sync"
	"unicode/utf8"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/schema"

	"github.com/mudler/LocalAI/core/gallery"
	"github.com/mudler/LocalAI/pkg/grpc"
	"github.com/mudler/LocalAI/pkg/grpc/proto"
	model "github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/LocalAI/pkg/utils"
)
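// LLMResponse bundles the text produced by a single inference call with its
// token usage accounting.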
type LLMResponse struct {
	Response string
	Usage    TokenUsage
}
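// TokenUsage counts the tokens consumed by the prompt and generated in the
// completion.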
type TokenUsage struct {
	Prompt     int
	Completion int
}
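// ModelInference loads the backend selected by the model configuration and
// returns a closure that runs the actual prediction. When tokenCallback is
// non-nil, generation is streamed and the callback is invoked for every
// decoded token.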
func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
	modelFile := c.Model

	// Fall back to the application-wide thread count when the model config
	// does not set one; the nil check guards against an unset pointer.
	threads := c.Threads
	if threads == nil || (*threads == 0 && o.Threads != 0) {
		threads = &o.Threads
	}
	grpcOpts := gRPCModelOpts(c)
	var inferenceModel grpc.Backend
	var err error

	opts := modelOpts(c, o, []model.Option{
		model.WithLoadGRPCLoadModelOpts(grpcOpts),
		model.WithThreads(uint32(*threads)),
		model.WithAssetDir(o.AssetsDestination),
		model.WithModel(modelFile),
		model.WithContext(o.Context),
	})

	if c.Backend != "" {
		opts = append(opts, model.WithBackendString(c.Backend))
	}
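	// If the model file is missing on disk and gallery autoloading is enabled,
	// try to install it from the configured galleries before loading it.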
	if o.AutoloadGalleries {
		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
			utils.ResetDownloadTimers()

			err := gallery.InstallModelFromGallery(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction, o.EnforcePredownloadScans)
			if err != nil {
				return nil, err
			}
		}
	}
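	// Without an explicit backend, let the greedy loader probe the available
	// backends; otherwise load the one requested by the configuration.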
	if c.Backend == "" {
		inferenceModel, err = loader.GreedyLoader(opts...)
	} else {
		inferenceModel, err = loader.BackendLoader(opts...)
	}

	if err != nil {
		return nil, err
	}
	var protoMessages []*proto.Message
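	// When the backend applies the model's own tokenizer template, forward the
	// chat messages as-is instead of a pre-rendered prompt string.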
	if c.TemplateConfig.UseTokenizerTemplate && s == "" {
		protoMessages = make([]*proto.Message, len(messages))
		for i, message := range messages {
			protoMessages[i] = &proto.Message{
				Role: message.Role,
			}
			switch ct := message.Content.(type) {
			case string:
				protoMessages[i].Content = ct
			default:
				return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
			}
		}
	}
	fn := func() (LLMResponse, error) {
		opts := gRPCPredictOpts(c, loader.ModelPath)
		opts.Prompt = s
		opts.Messages = protoMessages
		opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
		opts.Images = images

		tokenUsage := TokenUsage{}
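		// Token usage accounting is gated behind the per-model "usage" feature
		// flag, since counting prompt and completion tokens adds extra work per
		// request.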
		if c.FeatureFlag.Enabled("usage") {
			userTokenCallback := tokenCallback
			if userTokenCallback == nil {
				userTokenCallback = func(token string, usage TokenUsage) bool {
					return true
				}
			}

			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
			if pErr == nil && promptInfo.Length > 0 {
				tokenUsage.Prompt = int(promptInfo.Length)
			}

			// Wrap the user callback so completion tokens are counted as they
			// are generated.
			tokenCallback = func(token string, usage TokenUsage) bool {
				tokenUsage.Completion++
				return userTokenCallback(token, tokenUsage)
			}
		}
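		// Streaming mode: forward each decoded rune to the callback as it
		// arrives and accumulate the full response.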
		if tokenCallback != nil {
			ss := ""

			var partialRune []byte
			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
				partialRune = append(partialRune, chars...)

				// Chunks may split multi-byte UTF-8 sequences: decode complete
				// runes and keep any trailing partial rune for the next chunk.
				for len(partialRune) > 0 {
					r, size := utf8.DecodeRune(partialRune)
					if r == utf8.RuneError {
						// incomplete rune, wait for more bytes
						break
					}

					tokenCallback(string(r), tokenUsage)
					ss += string(r)

					partialRune = partialRune[size:]
				}
			})
			return LLMResponse{
				Response: ss,
				Usage:    tokenUsage,
			}, err
		} else {
			// Non-streaming mode: a single blocking prediction.
			reply, err := inferenceModel.Predict(ctx, opts)
			if err != nil {
				return LLMResponse{}, err
			}
			// Prefer the token counts reported by the backend when usage was
			// not measured above.
			if tokenUsage.Prompt == 0 {
				tokenUsage.Prompt = int(reply.PromptTokens)
			}
			if tokenUsage.Completion == 0 {
				tokenUsage.Completion = int(reply.Tokens)
			}
			return LLMResponse{
				Response: string(reply.Message),
				Usage:    tokenUsage,
			}, err
		}
	}

	return fn, nil
}
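// cutstrings caches compiled cutstring patterns so each regular expression is
// compiled only once; mu guards concurrent access to the cache.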
var cutstrings = make(map[string]*regexp.Regexp)
var mu sync.Mutex
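// Finetune post-processes a raw prediction according to the backend config:
// it optionally echoes the input, removes cutstring matches, and trims the
// configured prefixes and suffixes.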
func Finetune(config config.BackendConfig, input, prediction string) string {
	if config.Echo {
		prediction = input + prediction
	}

	for _, c := range config.Cutstrings {
		mu.Lock()
		reg, ok := cutstrings[c]
		if !ok {
			reg = regexp.MustCompile(c)
			cutstrings[c] = reg
		}
		mu.Unlock()
		prediction = reg.ReplaceAllString(prediction, "")
	}

	for _, c := range config.TrimSpace {
		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
	}

	for _, c := range config.TrimSuffix {
		prediction = strings.TrimSpace(strings.TrimSuffix(prediction, c))
	}
	return prediction
}