File size: 3,303 Bytes
6f25f68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/**
 * A class that helps with chunking streaming responses
 * from an LLM into whole sentences (or as close as we can get).
 */
export class SentenceChunker {
    /**
     * @param {Object} options
     * @param {number} options.chunkLength - The maximum length of a chunk (default: 128)
     * @param {boolean} options.emitParagraphs - Whether to emit paragraphs as chunks (default: true)
     */
    constructor(options = {}) {
        // Text accumulated so far that has not yet been emitted.
        this.buffer = "";
        this.chunkLength = options.chunkLength || 128;
        this.emitParagraphs = options.emitParagraphs !== false;
        this.callbacks = [];
    }

    /**
     * Emit a chunk of text to every registered callback
     * @param {string} output - The chunk of text to emit
     */
    emit(output) {
        this.callbacks.forEach(cb => cb(output));
    }

    /**
     * Register a callback to be called when a chunk is emitted
     * @param {Function} callback - The callback to call
     */
    onChunk(callback) {
        this.callbacks.push(callback);
    }

    /**
     * Push new data into the chunker. The data is split into paragraphs and
     * sentences; a chunk is emitted whenever appending the next sentence
     * would exceed chunkLength, and (when emitParagraphs is enabled) at
     * every paragraph boundary.
     * @param {string} data - The new data to push
     */
    push(data) {
        // Capturing group keeps the newline runs as their own segments,
        // so paragraph separators are preserved in the emitted stream.
        const paragraphs = data.split(/(\n+)/);
        const numParagraphs = paragraphs.length;
        for (let i = 0; i < numParagraphs; i++) {
            const paragraph = paragraphs[i];
            if (!paragraph) {
                continue;
            }
            // Split after punctuation followed by whitespace, or directly
            // after clause/full-width punctuation (no trailing space needed).
            const sentences = paragraph.split(/(?<=[;:,.!?]\s+)|(?<=[;:,。!?])/);
            let bufferLength = this.buffer.length;
            for (const sentence of sentences) {
                const sentenceLength = sentence.length;
                if (sentenceLength === 0) {
                    continue;
                }
                if (bufferLength + sentenceLength <= this.chunkLength) {
                    // Sentence still fits in the current chunk.
                    this.buffer += sentence;
                    bufferLength += sentenceLength;
                } else {
                    // Chunk is full: emit it and start a new one with this
                    // sentence (which may itself exceed chunkLength).
                    if (bufferLength > 0) {
                        this.emit(this.buffer);
                    }
                    this.buffer = sentence;
                    bufferLength = sentenceLength;
                }
            }

            // Emit at each paragraph boundary, but never after the final
            // segment — more data for it may still arrive.
            if (this.emitParagraphs && numParagraphs > 1 && i < numParagraphs - 1) {
                this.emit(this.buffer);
                this.buffer = "";
            }
        }
    }

    /**
     * Flush the buffer, emitting any remaining text
     */
    flush() {
        if (this.buffer.length > 0) {
            this.emit(this.buffer);
            this.buffer = "";
        }
    }
}

/**
 * A SentenceChunker that accepts cumulative (ever-growing) response text,
 * e.g. an LLM stream where every update contains the full text so far
 * rather than just the newly generated suffix.
 */
export class GrowingSentenceChunker extends SentenceChunker {
    constructor(options = {}) {
        super(options);
        // Full text seen so far; used to diff out the unseen suffix on push.
        this.partialSentence = "";
    }

    /**
     * Push new data into the chunker
     * @param {string} data - The new data to push
     */
    push(data) {
        const seenLength = this.partialSentence.length;
        const delta = data.slice(seenLength);
        this.partialSentence = this.partialSentence + delta;
        super.push(delta);
    }

    /**
     * Flush the buffer, emitting any remaining text
     */
    flush() {
        super.flush();
        this.partialSentence = "";
    }
}