Basic Usage
Generate embeddings for text:
import { embed } from '@core-ai/core-ai';
import { createOpenAI } from '@core-ai/openai';
const openai = createOpenAI({ apiKey: process.env.OPENAI_API_KEY });
const model = openai.embeddingModel('text-embedding-3-small');
const result = await embed({
model,
input: [
'TypeScript enables safer refactoring.',
'Runtime validation complements static typing.',
],
});
console.log('Embedding count:', result.embeddings.length);
console.log('Vector dimensions:', result.embeddings[0]?.length ?? 0);
console.log('First vector preview:', result.embeddings[0]?.slice(0, 8));
console.log('Usage:', result.usage);
Single vs Batch Embedding
Embed single or multiple texts:
- Single Text
- Multiple Texts
Copy
const result = await embed({
model,
input: 'TypeScript provides type safety.',
});
// result.embeddings is an array with one vector
const vector = result.embeddings[0];
console.log('Dimensions:', vector.length);
Copy
const texts = [
'TypeScript provides type safety.',
'JavaScript is dynamically typed.',
'Python is a versatile language.',
'Rust offers memory safety.',
];
const result = await embed({ model, input: texts });
// result.embeddings has one vector per input
result.embeddings.forEach((vector, index) => {
console.log(`Text ${index}: ${vector.length} dimensions`);
});
Embedding Response
The embed() function returns an EmbedResult:
Copy
type EmbedResult = {
embeddings: number[][]; // Array of vectors
usage?: EmbeddingUsage; // Token usage (if available)
};
type EmbeddingUsage = {
inputTokens: number; // Tokens consumed
};
Configuring Dimensions
Some models support custom dimensions:
const result = await embed({
model,
input: 'Your text here',
dimensions: 256, // Reduce dimensions for smaller vectors
});
console.log('Dimensions:', result.embeddings[0].length); // 256
Not all embedding models support custom dimensions. Check your provider’s documentation.
Similarity Search
Calculate similarity between vectors:
import { embed } from '@core-ai/core-ai';
// Cosine similarity function
// Cosine similarity between two equal-length vectors, in [-1, 1].
// Returns 0 when either vector has zero magnitude, instead of the
// NaN the unguarded division would produce.
function cosineSimilarity(a: number[], b: number[]): number {
  const dotProduct = a.reduce((sum, val, i) => sum + val * b[i], 0);
  const magnitudeA = Math.sqrt(a.reduce((sum, val) => sum + val * val, 0));
  const magnitudeB = Math.sqrt(b.reduce((sum, val) => sum + val * val, 0));
  const denominator = magnitudeA * magnitudeB;
  return denominator === 0 ? 0 : dotProduct / denominator;
}
// Embed query and documents
const query = 'What is TypeScript?';
const documents = [
'TypeScript is a typed superset of JavaScript.',
'Python is a high-level programming language.',
'JavaScript runs in web browsers.',
'TypeScript adds static types to JavaScript.',
];
const result = await embed({
model,
input: [query, ...documents],
});
const queryVector = result.embeddings[0];
const docVectors = result.embeddings.slice(1);
// Calculate similarities
const similarities = docVectors.map((docVector, index) => ({
document: documents[index],
similarity: cosineSimilarity(queryVector, docVector),
}));
// Sort by similarity
similarities.sort((a, b) => b.similarity - a.similarity);
console.log('Most similar documents:');
similarities.forEach(({ document, similarity }) => {
console.log(`${similarity.toFixed(4)}: ${document}`);
});
Semantic Search Example
Build a simple semantic search system:
import { embed } from '@core-ai/core-ai';
type Document = {
id: string;
text: string;
embedding?: number[];
};
// In-memory semantic search over a small document collection.
// Documents are embedded once via indexDocuments(); search() embeds
// the query and ranks stored documents by cosine similarity.
class SemanticSearch {
  constructor(
    private model: EmbeddingModel,
    private documents: Document[] = []
  ) {}

  /** Embeds every document's text and stores the vector alongside it. */
  async indexDocuments(documents: Document[]): Promise<void> {
    const result = await embed({
      model: this.model,
      input: documents.map(({ text }) => text),
    });
    this.documents = documents.map((doc, position) => ({
      ...doc,
      embedding: result.embeddings[position],
    }));
  }

  /** Returns the topK indexed documents most similar to the query. */
  async search(query: string, topK: number = 5): Promise<Document[]> {
    const { embeddings } = await embed({ model: this.model, input: query });
    const queryVector = embeddings[0];
    const ranked = this.documents
      .filter((doc) => doc.embedding)
      .map((doc) => ({
        ...doc,
        similarity: cosineSimilarity(queryVector, doc.embedding!),
      }));
    ranked.sort((first, second) => second.similarity - first.similarity);
    return ranked.slice(0, topK);
  }
}
// Cosine similarity between two equal-length vectors, in [-1, 1].
// Returns 0 when either vector has zero magnitude, instead of the
// NaN the unguarded division would produce.
function cosineSimilarity(a: number[], b: number[]): number {
  const dotProduct = a.reduce((sum, val, i) => sum + val * b[i], 0);
  const magnitudeA = Math.sqrt(a.reduce((sum, val) => sum + val * val, 0));
  const magnitudeB = Math.sqrt(b.reduce((sum, val) => sum + val * val, 0));
  const denominator = magnitudeA * magnitudeB;
  return denominator === 0 ? 0 : dotProduct / denominator;
}
// Usage
const search = new SemanticSearch(model);
await search.indexDocuments([
{ id: '1', text: 'TypeScript adds static types to JavaScript' },
{ id: '2', text: 'Python is great for data science' },
{ id: '3', text: 'JavaScript runs in web browsers' },
{ id: '4', text: 'TypeScript compiles to JavaScript' },
]);
const results = await search.search('typed JavaScript', 2);
console.log('Search results:', results);
Clustering Documents
Group similar documents together:
import { embed } from '@core-ai/core-ai';
type Cluster = {
  centroid: number[];
  documents: Array<{ text: string; vector: number[] }>;
};

/**
 * Groups vectors into k clusters with naive k-means, using cosine
 * similarity as the assignment metric.
 *
 * @param vectors - One embedding vector per document.
 * @param texts - Document texts, parallel to `vectors`.
 * @param k - Number of clusters (should be <= vectors.length).
 * @param iterations - Fixed number of refinement passes.
 * @returns One Cluster per centroid with its assigned documents.
 */
function kMeansClustering(
  vectors: number[][],
  texts: string[],
  k: number,
  iterations: number = 10
): Cluster[] {
  // Deterministic initialization: first k vectors become centroids.
  let centroids = vectors.slice(0, k);
  // Local cosine similarity; guards zero-magnitude vectors against NaN.
  const similarity = (a: number[], b: number[]): number => {
    const dot = a.reduce((sum, val, i) => sum + val * b[i], 0);
    const magA = Math.sqrt(a.reduce((s, v) => s + v * v, 0));
    const magB = Math.sqrt(b.reduce((s, v) => s + v * v, 0));
    const denom = magA * magB;
    return denom === 0 ? 0 : dot / denom;
  };
  // Index of the most similar centroid (first wins on ties).
  const nearestIndex = (vector: number[]): number => {
    const sims = centroids.map((c) => similarity(vector, c));
    return sims.indexOf(Math.max(...sims));
  };
  for (let iter = 0; iter < iterations; iter++) {
    // Assign every vector to its nearest centroid.
    const clusters: Cluster[] = centroids.map((centroid) => ({
      centroid,
      documents: [],
    }));
    vectors.forEach((vector, index) => {
      clusters[nearestIndex(vector)].documents.push({ text: texts[index], vector });
    });
    // Move each centroid to the mean of its members. An empty cluster
    // keeps its previous centroid — the original code crashed here on
    // cluster.documents[0] being undefined.
    centroids = clusters.map((cluster, ci) => {
      if (cluster.documents.length === 0) return centroids[ci];
      const dims = cluster.documents[0].vector.length;
      return Array.from({ length: dims }, (_, i) =>
        cluster.documents.reduce((sum, doc) => sum + doc.vector[i], 0) /
        cluster.documents.length
      );
    });
  }
  // Final assignment pass against the converged centroids.
  const result: Cluster[] = centroids.map((centroid) => ({
    centroid,
    documents: [],
  }));
  vectors.forEach((vector, index) => {
    result[nearestIndex(vector)].documents.push({ text: texts[index], vector });
  });
  return result;
}
// Cosine similarity between two equal-length vectors, in [-1, 1].
// Returns 0 when either vector has zero magnitude, instead of the
// NaN the unguarded division would produce.
function cosineSimilarity(a: number[], b: number[]): number {
  const dotProduct = a.reduce((sum, val, i) => sum + val * b[i], 0);
  const magnitudeA = Math.sqrt(a.reduce((sum, val) => sum + val * val, 0));
  const magnitudeB = Math.sqrt(b.reduce((sum, val) => sum + val * val, 0));
  const denominator = magnitudeA * magnitudeB;
  return denominator === 0 ? 0 : dotProduct / denominator;
}
// Example usage
const documents = [
'TypeScript adds types to JavaScript',
'Python is used for data science',
'JavaScript powers web applications',
'TypeScript provides better tooling',
'Python has great ML libraries',
];
const result = await embed({ model, input: documents });
const clusters = kMeansClustering(result.embeddings, documents, 2);
clusters.forEach((cluster, i) => {
console.log(`\nCluster ${i + 1}:`);
cluster.documents.forEach((doc) => console.log(` - ${doc.text}`));
});
Using Different Providers
- OpenAI
- Other Providers
Copy
import { embed } from '@core-ai/core-ai';
import { createOpenAI } from '@core-ai/openai';
const openai = createOpenAI({
apiKey: process.env.OPENAI_API_KEY
});
const model = openai.embeddingModel('text-embedding-3-small');
const result = await embed({
model,
input: 'Your text here',
});
Check your provider’s documentation for embedding model support:
Copy
// Example pattern
const provider = createProvider({ apiKey });
const model = provider.embeddingModel('model-name');
const result = await embed({ model, input: 'text' });
Integration with Vector Databases
Store embeddings in vector databases:
- Pinecone
- Weaviate
- Qdrant
Copy
import { embed } from '@core-ai/core-ai';
import { Pinecone } from '@pinecone-database/pinecone';
const pinecone = new Pinecone({ apiKey: process.env.PINECONE_API_KEY });
const index = pinecone.Index('my-index');
// Generate embeddings
const result = await embed({
model,
input: ['Document 1 text', 'Document 2 text'],
});
// Upsert to Pinecone
await index.upsert([
{
id: 'doc1',
values: result.embeddings[0],
metadata: { text: 'Document 1 text' },
},
{
id: 'doc2',
values: result.embeddings[1],
metadata: { text: 'Document 2 text' },
},
]);
// Query similar vectors
const queryResult = await embed({ model, input: 'search query' });
const matches = await index.query({
vector: queryResult.embeddings[0],
topK: 5,
includeMetadata: true,
});
Copy
import { embed } from '@core-ai/core-ai';
import weaviate from 'weaviate-ts-client';
const client = weaviate.client({
scheme: 'http',
host: 'localhost:8080',
});
const texts = ['Document 1', 'Document 2'];
const result = await embed({ model, input: texts });
// Store embeddings
for (let i = 0; i < texts.length; i++) {
await client.data
.creator()
.withClassName('Document')
.withProperties({ text: texts[i] })
.withVector(result.embeddings[i])
.do();
}
Copy
import { embed } from '@core-ai/core-ai';
import { QdrantClient } from '@qdrant/js-client-rest';
const client = new QdrantClient({ url: 'http://localhost:6333' });
const texts = ['Document 1', 'Document 2'];
const result = await embed({ model, input: texts });
await client.upsert('my_collection', {
points: texts.map((text, i) => ({
id: i,
vector: result.embeddings[i],
payload: { text },
})),
});
Best Practices
Batch embeddings for efficiency
Process multiple texts in a single request:
Copy
// Good: batch processing
const texts = ['Text 1', 'Text 2', 'Text 3'];
const result = await embed({ model, input: texts });
// Avoid: individual requests
// for (const text of texts) {
// await embed({ model, input: text });
// }
Cache embeddings
Embeddings are deterministic - cache them:
Copy
// Memoized embedding lookup: embeddings are deterministic, so the
// vector for a given input string is computed once and cached.
const cache = new Map<string, number[]>();

async function getEmbedding(text: string): Promise<number[]> {
  const cached = cache.get(text);
  if (cached !== undefined) {
    return cached;
  }
  const result = await embed({ model, input: text });
  const vector = result.embeddings[0];
  cache.set(text, vector);
  return vector;
}
Normalize vectors for cosine similarity
Pre-normalize vectors for faster similarity:
Copy
// Scales a vector to unit length. A zero-magnitude vector is returned
// as an unchanged copy instead of producing NaN/Infinity components
// from division by zero.
function normalizeVector(vector: number[]): number[] {
  const magnitude = Math.sqrt(
    vector.reduce((sum, val) => sum + val * val, 0)
  );
  if (magnitude === 0) {
    return [...vector];
  }
  return vector.map((val) => val / magnitude);
}
// Inner product of two equal-length vectors.
function dotProduct(a: number[], b: number[]): number {
  let total = 0;
  for (let i = 0; i < a.length; i++) {
    total += a[i] * b[i];
  }
  return total;
}
// Normalize once
const normalizedVectors = vectors.map(normalizeVector);
// Fast similarity (just dot product)
const similarity = dotProduct(normalizedVectors[0], normalizedVectors[1]);
Use appropriate chunk sizes
Split long texts into chunks:
Copy
// Splits text into whitespace-delimited chunks of at most maxLength
// characters. A single word longer than maxLength becomes its own
// oversized chunk (it is never split mid-word). Returns [] for empty
// input.
function chunkText(text: string, maxLength: number = 500): string[] {
  const words = text.split(' ');
  const chunks: string[] = [];
  let current = '';
  for (const word of words) {
    const candidate = current ? current + ' ' + word : word;
    if (candidate.length > maxLength && current) {
      // Flush the current chunk and start a new one. The `current`
      // guard fixes the original bug of pushing an empty '' chunk
      // when the very first word already exceeded maxLength.
      chunks.push(current.trim());
      current = word;
    } else {
      current = candidate;
    }
  }
  if (current) chunks.push(current.trim());
  return chunks;
}
const chunks = chunkText(longDocument);
const result = await embed({ model, input: chunks });