feat: add oceanbase vector support (#1813)
parent
a6675c65de
commit
3bd7340d46
@ -0,0 +1,158 @@ |
||||
/* |
||||
* Copyright 2025 coze-dev Authors |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package oceanbase |
||||
|
||||
import ( |
||||
"fmt" |
||||
"time" |
||||
) |
||||
|
||||
const ( |
||||
defaultBatchSize = 100 |
||||
defaultTopK = 10 |
||||
defaultVectorDimension = 2048 |
||||
defaultVectorMemoryLimitPercentage = 30 |
||||
defaultMaxOpenConns = 100 |
||||
defaultMaxIdleConns = 10 |
||||
defaultConnMaxLifetime = 3600 |
||||
defaultConnMaxIdleTime = 1800 |
||||
defaultCacheTTL = 300 |
||||
defaultConnTimeout = 30 |
||||
defaultMaxRetries = 3 |
||||
defaultRetryDelay = 1 |
||||
maxVectorDimension = 4096 |
||||
maxCollectionNameLength = 255 |
||||
maxSQLIdentifierLength = 64 |
||||
maxContentLength = 65535 |
||||
maxBatchSize = 1000 |
||||
|
||||
enableCacheDefault = true |
||||
enableMetricsDefault = true |
||||
enableSlowQueryLogDefault = true |
||||
slowQueryThreshold = 1000 |
||||
|
||||
ErrCodeInvalidConfig = "INVALID_CONFIG" |
||||
ErrCodeConnectionFailed = "CONNECTION_FAILED" |
||||
ErrCodeQueryTimeout = "QUERY_TIMEOUT" |
||||
ErrCodeVectorDimensionMismatch = "VECTOR_DIMENSION_MISMATCH" |
||||
ErrCodeCollectionNotFound = "COLLECTION_NOT_FOUND" |
||||
ErrCodeDuplicateCollection = "DUPLICATE_COLLECTION" |
||||
) |
||||
|
||||
type Config struct { |
||||
Host string |
||||
Port int |
||||
User string |
||||
Password string |
||||
Database string |
||||
|
||||
VectorDimension int |
||||
MetricType string |
||||
MaxOpenConns int |
||||
MaxIdleConns int |
||||
ConnMaxLifetime time.Duration |
||||
ConnMaxIdleTime time.Duration |
||||
VectorMemoryLimitPercentage int |
||||
BatchSize int |
||||
|
||||
EnableCache bool |
||||
CacheTTL time.Duration |
||||
EnableMetrics bool |
||||
EnableSlowQueryLog bool |
||||
MaxRetries int |
||||
RetryDelay time.Duration |
||||
ConnTimeout time.Duration |
||||
} |
||||
|
||||
func DefaultConfig() *Config { |
||||
return &Config{ |
||||
Host: getEnv("OCEANBASE_HOST", "localhost"), |
||||
Port: getEnvAsInt("OCEANBASE_PORT", 2881), |
||||
User: getEnv("OCEANBASE_USER", "root"), |
||||
Password: getEnv("OCEANBASE_PASSWORD", ""), |
||||
Database: getEnv("OCEANBASE_DATABASE", "test"), |
||||
VectorDimension: getVectorDimension(), |
||||
MetricType: "cosine", |
||||
MaxOpenConns: getEnvAsInt("OCEANBASE_MAX_OPEN_CONNS", defaultMaxOpenConns), |
||||
MaxIdleConns: getEnvAsInt("OCEANBASE_MAX_IDLE_CONNS", defaultMaxIdleConns), |
||||
ConnMaxLifetime: time.Duration(getEnvAsInt("OCEANBASE_CONN_MAX_LIFETIME", defaultConnMaxLifetime)) * time.Second, |
||||
ConnMaxIdleTime: time.Duration(getEnvAsInt("OCEANBASE_CONN_MAX_IDLE_TIME", defaultConnMaxIdleTime)) * time.Second, |
||||
VectorMemoryLimitPercentage: getEnvAsInt("OCEANBASE_VECTOR_MEMORY_LIMIT_PERCENTAGE", defaultVectorMemoryLimitPercentage), |
||||
BatchSize: getEnvAsInt("OCEANBASE_BATCH_SIZE", defaultBatchSize), |
||||
EnableCache: getEnvAsBool("OCEANBASE_ENABLE_CACHE", enableCacheDefault), |
||||
CacheTTL: time.Duration(getEnvAsInt("OCEANBASE_CACHE_TTL", defaultCacheTTL)) * time.Second, |
||||
EnableMetrics: getEnvAsBool("OCEANBASE_ENABLE_METRICS", enableMetricsDefault), |
||||
EnableSlowQueryLog: getEnvAsBool("OCEANBASE_ENABLE_SLOW_QUERY_LOG", enableSlowQueryLogDefault), |
||||
MaxRetries: getEnvAsInt("OCEANBASE_MAX_RETRIES", defaultMaxRetries), |
||||
RetryDelay: time.Duration(getEnvAsInt("OCEANBASE_RETRY_DELAY", defaultRetryDelay)) * time.Second, |
||||
ConnTimeout: time.Duration(getEnvAsInt("OCEANBASE_CONN_TIMEOUT", defaultConnTimeout)) * time.Second, |
||||
} |
||||
} |
||||
|
||||
func (c *Config) Validate() error { |
||||
if c.Host == "" { |
||||
return fmt.Errorf("host cannot be empty") |
||||
} |
||||
if c.Port <= 0 || c.Port > 65535 { |
||||
return fmt.Errorf("port must be between 1 and 65535") |
||||
} |
||||
if c.User == "" { |
||||
return fmt.Errorf("user cannot be empty") |
||||
} |
||||
if c.Database == "" { |
||||
return fmt.Errorf("database cannot be empty") |
||||
} |
||||
if c.VectorDimension <= 0 || c.VectorDimension > maxVectorDimension { |
||||
return fmt.Errorf("vector dimension must be between 1 and %d", maxVectorDimension) |
||||
} |
||||
if c.BatchSize <= 0 || c.BatchSize > maxBatchSize { |
||||
return fmt.Errorf("batch size must be between 1 and %d", maxBatchSize) |
||||
} |
||||
if c.MaxOpenConns <= 0 { |
||||
return fmt.Errorf("max open connections must be positive") |
||||
} |
||||
if c.MaxIdleConns <= 0 || c.MaxIdleConns > c.MaxOpenConns { |
||||
return fmt.Errorf("max idle connections must be positive and not greater than max open connections") |
||||
} |
||||
if c.CacheTTL <= 0 { |
||||
return fmt.Errorf("cache TTL must be positive") |
||||
} |
||||
if c.MaxRetries < 0 { |
||||
return fmt.Errorf("max retries cannot be negative") |
||||
} |
||||
if c.RetryDelay < 0 { |
||||
return fmt.Errorf("retry delay cannot be negative") |
||||
} |
||||
if c.ConnTimeout <= 0 { |
||||
return fmt.Errorf("connection timeout must be positive") |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
func getVectorDimension() int { |
||||
if dims := getEnvAsInt("ARK_EMBEDDING_DIMS", 0); dims > 0 { |
||||
return dims |
||||
} |
||||
if dims := getEnvAsInt("OPENAI_EMBEDDING_DIMS", 0); dims > 0 { |
||||
return dims |
||||
} |
||||
return defaultVectorDimension |
||||
} |
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,239 @@ |
||||
/* |
||||
* Copyright 2025 coze-dev Authors |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package oceanbase |
||||
|
||||
import ( |
||||
"encoding/json" |
||||
"fmt" |
||||
"math" |
||||
"regexp" |
||||
"strings" |
||||
"unicode" |
||||
|
||||
"github.com/cloudwego/eino/schema" |
||||
) |
||||
|
||||
func TableName(collectionName string) string { |
||||
cleanName := strings.Map(func(r rune) rune { |
||||
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '_' { |
||||
return r |
||||
} |
||||
return '_' |
||||
}, collectionName) |
||||
return fmt.Sprintf("vector_%s", strings.ToLower(cleanName)) |
||||
} |
||||
|
||||
func ExtractContent(doc *schema.Document) string { |
||||
if doc.Content != "" { |
||||
return strings.TrimSpace(doc.Content) |
||||
} |
||||
if doc.MetaData != nil { |
||||
if content, ok := doc.MetaData["content"].(string); ok && content != "" { |
||||
return strings.TrimSpace(content) |
||||
} |
||||
if text, ok := doc.MetaData["text"].(string); ok && text != "" { |
||||
return strings.TrimSpace(text) |
||||
} |
||||
} |
||||
return "" |
||||
} |
||||
|
||||
func BuildMetadata(doc *schema.Document) map[string]interface{} { |
||||
metadata := make(map[string]interface{}) |
||||
if doc.MetaData != nil { |
||||
for k, v := range doc.MetaData { |
||||
metadata[k] = v |
||||
} |
||||
} |
||||
metadata["document_id"] = doc.ID |
||||
metadata["content"] = doc.Content |
||||
metadata["content_length"] = len(doc.Content) |
||||
return metadata |
||||
} |
||||
|
||||
func MetadataToJSON(metadata map[string]interface{}) (string, error) { |
||||
if metadata == nil { |
||||
return "{}", nil |
||||
} |
||||
jsonBytes, err := json.Marshal(metadata) |
||||
if err != nil { |
||||
return "", fmt.Errorf("failed to marshal metadata: %w", err) |
||||
} |
||||
return string(jsonBytes), nil |
||||
} |
||||
|
||||
func JSONToMetadata(jsonStr string) (map[string]interface{}, error) { |
||||
if jsonStr == "" { |
||||
return make(map[string]interface{}), nil |
||||
} |
||||
var metadata map[string]interface{} |
||||
if err := json.Unmarshal([]byte(jsonStr), &metadata); err != nil { |
||||
return nil, fmt.Errorf("failed to unmarshal metadata: %w", err) |
||||
} |
||||
return metadata, nil |
||||
} |
||||
|
||||
func ValidateCollectionName(name string) error { |
||||
if name == "" { |
||||
return fmt.Errorf("collection name cannot be empty") |
||||
} |
||||
if len(name) > maxCollectionNameLength { |
||||
return fmt.Errorf("collection name too long (max %d characters)", maxCollectionNameLength) |
||||
} |
||||
|
||||
if len(name) > 0 && unicode.IsDigit(rune(name[0])) { |
||||
return fmt.Errorf("collection name cannot start with a digit") |
||||
} |
||||
|
||||
for _, r := range name { |
||||
if !((r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '_' || r == '-') { |
||||
return fmt.Errorf("collection name contains invalid character: %c", r) |
||||
} |
||||
} |
||||
|
||||
if isReservedWord(name) { |
||||
return fmt.Errorf("collection name is a reserved word: %s", name) |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func BuildInClause(values []string) string { |
||||
if len(values) == 0 { |
||||
return "()" |
||||
} |
||||
quoted := make([]string, len(values)) |
||||
for i, v := range values { |
||||
quoted[i] = fmt.Sprintf("'%s'", v) |
||||
} |
||||
return fmt.Sprintf("(%s)", strings.Join(quoted, ",")) |
||||
} |
||||
|
||||
func ConvertToFloat32(f64 []float64) []float32 { |
||||
f32 := make([]float32, len(f64)) |
||||
for i, v := range f64 { |
||||
f32[i] = float32(v) |
||||
} |
||||
return f32 |
||||
} |
||||
|
||||
func ConvertToFloat64(f32 []float32) []float64 { |
||||
f64 := make([]float64, len(f32)) |
||||
for i, v := range f32 { |
||||
f64[i] = float64(v) |
||||
} |
||||
return f64 |
||||
} |
||||
|
||||
func SanitizeString(s string) string { |
||||
s = strings.Map(func(r rune) rune { |
||||
if r < 32 || r == 127 { |
||||
return -1 |
||||
} |
||||
return r |
||||
}, s) |
||||
|
||||
s = strings.Join(strings.Fields(s), " ") |
||||
|
||||
return strings.TrimSpace(s) |
||||
} |
||||
|
||||
func TruncateString(s string, maxLength int) string { |
||||
if len(s) <= maxLength { |
||||
return s |
||||
} |
||||
return s[:maxLength-3] + "..." |
||||
} |
||||
|
||||
func IsValidVector(vector []float32) error { |
||||
if len(vector) == 0 { |
||||
return fmt.Errorf("vector cannot be empty") |
||||
} |
||||
if len(vector) > maxVectorDimension { |
||||
return fmt.Errorf("vector dimension too large (max %d)", maxVectorDimension) |
||||
} |
||||
|
||||
for i, v := range vector { |
||||
if v != v { // NaN check
|
||||
return fmt.Errorf("vector contains NaN at index %d", i) |
||||
} |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func NormalizeVector(vector []float32) []float32 { |
||||
if len(vector) == 0 { |
||||
return vector |
||||
} |
||||
|
||||
var sum float32 |
||||
for _, v := range vector { |
||||
sum += v * v |
||||
} |
||||
|
||||
if sum == 0 { |
||||
return vector |
||||
} |
||||
|
||||
norm := float32(1.0 / math.Sqrt(float64(sum))) |
||||
normalized := make([]float32, len(vector)) |
||||
for i, v := range vector { |
||||
normalized[i] = v * norm |
||||
} |
||||
|
||||
return normalized |
||||
} |
||||
|
||||
var reservedWords = map[string]bool{ |
||||
"select": true, "from": true, "where": true, "insert": true, "update": true, |
||||
"delete": true, "drop": true, "create": true, "alter": true, "table": true, |
||||
"index": true, "primary": true, "foreign": true, "key": true, "constraint": true, |
||||
"order": true, "by": true, "group": true, "having": true, "union": true, |
||||
"all": true, "distinct": true, "as": true, "in": true, "between": true, |
||||
"like": true, "is": true, "null": true, "not": true, "and": true, "or": true, |
||||
"vector": true, "embedding": true, "collection": true, |
||||
} |
||||
|
||||
func isReservedWord(name string) bool { |
||||
return reservedWords[strings.ToLower(name)] |
||||
} |
||||
|
||||
func GenerateTableName(collectionName string, suffix string) string { |
||||
baseName := TableName(collectionName) |
||||
if suffix != "" { |
||||
return fmt.Sprintf("%s_%s", baseName, suffix) |
||||
} |
||||
return baseName |
||||
} |
||||
|
||||
func ValidateSQLIdentifier(identifier string) error { |
||||
if identifier == "" { |
||||
return fmt.Errorf("SQL identifier cannot be empty") |
||||
} |
||||
|
||||
if len(identifier) > 64 { |
||||
return fmt.Errorf("SQL identifier too long (max 64 characters)") |
||||
} |
||||
|
||||
matched, _ := regexp.MatchString(`^[a-zA-Z_][a-zA-Z0-9_]*$`, identifier) |
||||
if !matched { |
||||
return fmt.Errorf("SQL identifier format invalid: %s", identifier) |
||||
} |
||||
|
||||
return nil |
||||
} |
@ -0,0 +1,80 @@ |
||||
/* |
||||
* Copyright 2025 coze-dev Authors |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package oceanbase |
||||
|
||||
import ( |
||||
"context" |
||||
"fmt" |
||||
|
||||
"github.com/coze-dev/coze-studio/backend/infra/contract/document/searchstore" |
||||
"github.com/coze-dev/coze-studio/backend/infra/contract/embedding" |
||||
"github.com/coze-dev/coze-studio/backend/infra/impl/oceanbase" |
||||
"github.com/coze-dev/coze-studio/backend/pkg/logs" |
||||
) |
||||
|
||||
type Factory struct { |
||||
config *Config |
||||
} |
||||
|
||||
func NewFactory(config *Config) *Factory { |
||||
return &Factory{ |
||||
config: config, |
||||
} |
||||
} |
||||
|
||||
func (f *Factory) CreateManager(ctx context.Context, embedder embedding.Embedder) (searchstore.Manager, error) { |
||||
if err := f.config.Validate(); err != nil { |
||||
return nil, fmt.Errorf("invalid configuration: %w", err) |
||||
} |
||||
|
||||
dsn := fmt.Sprintf("%s:%s@tcp(%s:%d)/%s?charset=utf8mb4&parseTime=True&loc=Local", |
||||
f.config.User, f.config.Password, f.config.Host, f.config.Port, f.config.Database) |
||||
|
||||
client, err := oceanbase.NewOceanBaseClient(dsn) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to create OceanBase client: %w", err) |
||||
} |
||||
|
||||
managerConfig := &ManagerConfig{ |
||||
Client: client, |
||||
Embedding: embedder, |
||||
BatchSize: f.config.BatchSize, |
||||
EnableCache: f.config.EnableCache, |
||||
CacheTTL: f.config.CacheTTL, |
||||
MaxConnections: f.config.MaxOpenConns, |
||||
ConnTimeout: f.config.ConnTimeout, |
||||
} |
||||
|
||||
manager, err := NewManager(managerConfig) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to create OceanBase manager: %w", err) |
||||
} |
||||
|
||||
logs.CtxInfof(ctx, "Created OceanBase vector store manager with config: %s:%d/%s (dimension: %d, cache: %v, batchSize: %d)", |
||||
f.config.Host, f.config.Port, f.config.Database, f.config.VectorDimension, |
||||
f.config.EnableCache, f.config.BatchSize) |
||||
|
||||
return manager, nil |
||||
} |
||||
|
||||
func (f *Factory) GetType() searchstore.SearchStoreType { |
||||
return searchstore.TypeVectorStore |
||||
} |
||||
|
||||
func (f *Factory) GetConfig() *Config { |
||||
return f.config |
||||
} |
@ -0,0 +1,367 @@ |
||||
/* |
||||
* Copyright 2025 coze-dev Authors |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package oceanbase |
||||
|
||||
import ( |
||||
"context" |
||||
"fmt" |
||||
"os" |
||||
"strconv" |
||||
"sync" |
||||
"time" |
||||
|
||||
"github.com/coze-dev/coze-studio/backend/infra/contract/document/searchstore" |
||||
"github.com/coze-dev/coze-studio/backend/infra/contract/embedding" |
||||
"github.com/coze-dev/coze-studio/backend/infra/impl/oceanbase" |
||||
"github.com/coze-dev/coze-studio/backend/pkg/logs" |
||||
) |
||||
|
||||
type ManagerConfig struct { |
||||
Client *oceanbase.OceanBaseClient |
||||
Embedding embedding.Embedder |
||||
BatchSize int |
||||
|
||||
EnableCache bool |
||||
CacheTTL time.Duration |
||||
MaxConnections int |
||||
ConnTimeout time.Duration |
||||
|
||||
EnableConnectionPool bool |
||||
PoolMaxIdle int |
||||
PoolMaxLifetime time.Duration |
||||
QueryTimeout time.Duration |
||||
MaxRetries int // optional: default 3
|
||||
RetryDelay time.Duration // optional: default 1s
|
||||
} |
||||
|
||||
// Create an OceanBase vector storage manager
|
||||
func NewManager(config *ManagerConfig) (searchstore.Manager, error) { |
||||
if config.Client == nil { |
||||
return nil, fmt.Errorf("[NewManager] oceanbase client not provided") |
||||
} |
||||
if config.Embedding == nil { |
||||
return nil, fmt.Errorf("[NewManager] oceanbase embedder not provided") |
||||
} |
||||
|
||||
if config.BatchSize == 0 { |
||||
config.BatchSize = defaultBatchSize |
||||
} |
||||
|
||||
if config.CacheTTL == 0 { |
||||
config.CacheTTL = 5 * time.Minute |
||||
} |
||||
if config.MaxConnections == 0 { |
||||
config.MaxConnections = defaultMaxOpenConns |
||||
} |
||||
if config.ConnTimeout == 0 { |
||||
config.ConnTimeout = 30 * time.Second |
||||
} |
||||
|
||||
|
||||
if config.PoolMaxIdle == 0 { |
||||
config.PoolMaxIdle = 10 |
||||
} |
||||
if config.PoolMaxLifetime == 0 { |
||||
config.PoolMaxLifetime = 1 * time.Hour |
||||
} |
||||
if config.QueryTimeout == 0 { |
||||
config.QueryTimeout = 30 * time.Second |
||||
} |
||||
if config.MaxRetries == 0 { |
||||
config.MaxRetries = 3 |
||||
} |
||||
if config.RetryDelay == 0 { |
||||
config.RetryDelay = 1 * time.Second |
||||
} |
||||
|
||||
manager := &oceanbaseManager{ |
||||
config: config, |
||||
cache: make(map[string]*cachedSearchStore), |
||||
mu: &sync.RWMutex{}, |
||||
} |
||||
|
||||
if config.EnableCache { |
||||
go manager.startCacheCleaner() |
||||
} |
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), config.ConnTimeout) |
||||
defer cancel() |
||||
|
||||
if err := config.Client.InitDatabase(ctx); err != nil { |
||||
logs.CtxWarnf(ctx, "Failed to initialize OceanBase database: %v", err) |
||||
} |
||||
|
||||
logs.CtxInfof(ctx, "Created OceanBase vector store manager with cache=%v, batchSize=%d, pool=%v", |
||||
config.EnableCache, config.BatchSize, config.EnableConnectionPool) |
||||
return manager, nil |
||||
} |
||||
|
||||
type oceanbaseManager struct { |
||||
config *ManagerConfig |
||||
cache map[string]*cachedSearchStore |
||||
mu *sync.RWMutex |
||||
} |
||||
|
||||
// cachedSearchStore 缓存的搜索存储
|
||||
type cachedSearchStore struct { |
||||
store searchstore.SearchStore |
||||
lastUsed time.Time |
||||
} |
||||
|
||||
func (m *oceanbaseManager) Create(ctx context.Context, req *searchstore.CreateRequest) error { |
||||
if err := ValidateCollectionName(req.CollectionName); err != nil { |
||||
return fmt.Errorf("[Create] invalid collection name: %w", err) |
||||
} |
||||
|
||||
tableName := TableName(req.CollectionName) |
||||
|
||||
dimension := m.getVectorDimension() |
||||
|
||||
logs.CtxInfof(ctx, "[Create] Using dimension: %d for collection: %s", dimension, req.CollectionName) |
||||
|
||||
if err := m.config.Client.CreateCollection(ctx, req.CollectionName, dimension); err != nil { |
||||
return fmt.Errorf("[Create] create vector collection failed: %w", err) |
||||
} |
||||
|
||||
if err := m.recordCollection(ctx, req.CollectionName, tableName); err != nil { |
||||
logs.CtxWarnf(ctx, "Failed to record collection: %v", err) |
||||
} |
||||
|
||||
m.clearCache(req.CollectionName) |
||||
|
||||
logs.CtxInfof(ctx, "Created OceanBase collection: %s (table: %s)", req.CollectionName, tableName) |
||||
return nil |
||||
} |
||||
|
||||
func (m *oceanbaseManager) Drop(ctx context.Context, req *searchstore.DropRequest) error { |
||||
if err := ValidateCollectionName(req.CollectionName); err != nil { |
||||
return fmt.Errorf("[Drop] invalid collection name: %w", err) |
||||
} |
||||
|
||||
tableName := TableName(req.CollectionName) |
||||
|
||||
if err := m.config.Client.DropCollection(ctx, req.CollectionName); err != nil { |
||||
return fmt.Errorf("[Drop] drop collection failed: %w", err) |
||||
} |
||||
|
||||
if err := m.removeCollection(ctx, req.CollectionName); err != nil { |
||||
logs.CtxWarnf(ctx, "Failed to remove collection record: %v", err) |
||||
} |
||||
|
||||
m.clearCache(req.CollectionName) |
||||
|
||||
logs.CtxInfof(ctx, "Deleted OceanBase collection: %s (table: %s)", req.CollectionName, tableName) |
||||
return nil |
||||
} |
||||
|
||||
func (m *oceanbaseManager) GetType() searchstore.SearchStoreType { |
||||
return searchstore.TypeVectorStore |
||||
} |
||||
|
||||
func (m *oceanbaseManager) GetSearchStore(ctx context.Context, collectionName string) (searchstore.SearchStore, error) { |
||||
if err := ValidateCollectionName(collectionName); err != nil { |
||||
return nil, fmt.Errorf("[GetSearchStore] invalid collection name: %w", err) |
||||
} |
||||
|
||||
if m.config.EnableCache { |
||||
if cached := m.getCachedStore(collectionName); cached != nil { |
||||
return cached, nil |
||||
} |
||||
} |
||||
|
||||
store := &oceanbaseSearchStore{ |
||||
manager: m, |
||||
collectionName: collectionName, |
||||
tableName: TableName(collectionName), |
||||
} |
||||
|
||||
if m.config.EnableCache { |
||||
m.cacheStore(collectionName, store) |
||||
} |
||||
|
||||
return store, nil |
||||
} |
||||
|
||||
func (m *oceanbaseManager) recordCollection(ctx context.Context, collectionName, tableName string) error { |
||||
// Create collections metadata table if not exists
|
||||
createTableSQL := ` |
||||
CREATE TABLE IF NOT EXISTS oceanbase_collections ( |
||||
collection_name VARCHAR(255) PRIMARY KEY, |
||||
table_name VARCHAR(255) NOT NULL, |
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, |
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, |
||||
status ENUM('active', 'deleted') DEFAULT 'active' |
||||
)` |
||||
|
||||
if err := m.config.Client.GetDB().WithContext(ctx).Exec(createTableSQL).Error; err != nil { |
||||
return fmt.Errorf("failed to create collections metadata table: %w", err) |
||||
} |
||||
|
||||
// Insert or update collection record
|
||||
upsertSQL := ` |
||||
INSERT INTO oceanbase_collections (collection_name, table_name, status) |
||||
VALUES (?, ?, 'active') |
||||
ON DUPLICATE KEY UPDATE |
||||
table_name = VALUES(table_name), |
||||
status = 'active', |
||||
updated_at = CURRENT_TIMESTAMP` |
||||
|
||||
if err := m.config.Client.GetDB().WithContext(ctx).Exec(upsertSQL, collectionName, tableName).Error; err != nil { |
||||
return fmt.Errorf("failed to record collection metadata: %w", err) |
||||
} |
||||
|
||||
logs.CtxInfof(ctx, "Recorded collection: %s (table: %s)", collectionName, tableName) |
||||
return nil |
||||
} |
||||
|
||||
func (m *oceanbaseManager) removeCollection(ctx context.Context, collectionName string) error { |
||||
// Soft delete collection record by setting status to 'deleted'
|
||||
updateSQL := ` |
||||
UPDATE oceanbase_collections |
||||
SET status = 'deleted', updated_at = CURRENT_TIMESTAMP |
||||
WHERE collection_name = ?` |
||||
|
||||
if err := m.config.Client.GetDB().WithContext(ctx).Exec(updateSQL, collectionName).Error; err != nil { |
||||
return fmt.Errorf("failed to remove collection metadata: %w", err) |
||||
} |
||||
|
||||
logs.CtxInfof(ctx, "Removed collection record: %s", collectionName) |
||||
return nil |
||||
} |
||||
|
||||
func (m *oceanbaseManager) getCachedStore(collectionName string) searchstore.SearchStore { |
||||
m.mu.RLock() |
||||
defer m.mu.RUnlock() |
||||
|
||||
if cached, exists := m.cache[collectionName]; exists { |
||||
if time.Since(cached.lastUsed) < m.config.CacheTTL { |
||||
cached.lastUsed = time.Now() |
||||
return cached.store |
||||
} |
||||
delete(m.cache, collectionName) |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
func (m *oceanbaseManager) cacheStore(collectionName string, store searchstore.SearchStore) { |
||||
m.mu.Lock() |
||||
defer m.mu.Unlock() |
||||
|
||||
m.cache[collectionName] = &cachedSearchStore{ |
||||
store: store, |
||||
lastUsed: time.Now(), |
||||
} |
||||
} |
||||
|
||||
func (m *oceanbaseManager) clearCache(collectionName string) { |
||||
m.mu.Lock() |
||||
defer m.mu.Unlock() |
||||
|
||||
delete(m.cache, collectionName) |
||||
} |
||||
|
||||
func (m *oceanbaseManager) startCacheCleaner() { |
||||
ticker := time.NewTicker(m.config.CacheTTL / 2) |
||||
defer ticker.Stop() |
||||
|
||||
for range ticker.C { |
||||
m.cleanExpiredCache() |
||||
} |
||||
} |
||||
|
||||
func (m *oceanbaseManager) cleanExpiredCache() { |
||||
m.mu.Lock() |
||||
defer m.mu.Unlock() |
||||
|
||||
now := time.Now() |
||||
for key, cached := range m.cache { |
||||
if now.Sub(cached.lastUsed) > m.config.CacheTTL { |
||||
delete(m.cache, key) |
||||
} |
||||
} |
||||
} |
||||
|
||||
func (m *oceanbaseManager) getVectorDimension() int { |
||||
embeddingType := os.Getenv("EMBEDDING_TYPE") |
||||
|
||||
logs.Infof("[getVectorDimension] EMBEDDING_TYPE: %s", embeddingType) |
||||
|
||||
switch embeddingType { |
||||
case "ark": |
||||
if dimStr := os.Getenv("ARK_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
case "openai": |
||||
if dimStr := os.Getenv("OPENAI_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
case "ollama": |
||||
if dimStr := os.Getenv("OLLAMA_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
case "http": |
||||
if dimStr := os.Getenv("HTTP_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
case "gemini": |
||||
if dimStr := os.Getenv("GEMINI_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
} |
||||
|
||||
if dimStr := os.Getenv("ARK_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
|
||||
if dimStr := os.Getenv("OPENAI_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
|
||||
if dimStr := os.Getenv("OLLAMA_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
|
||||
if dimStr := os.Getenv("HTTP_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
|
||||
if dimStr := os.Getenv("GEMINI_EMBEDDING_DIMS"); dimStr != "" { |
||||
if dim, err := strconv.Atoi(dimStr); err == nil { |
||||
return dim |
||||
} |
||||
} |
||||
|
||||
return 1024 |
||||
} |
@ -0,0 +1,366 @@ |
||||
/* |
||||
* Copyright 2025 coze-dev Authors |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package oceanbase |
||||
|
||||
import ( |
||||
"context" |
||||
"encoding/json" |
||||
"fmt" |
||||
"sort" |
||||
"time" |
||||
|
||||
"github.com/cloudwego/eino/components/indexer" |
||||
"github.com/cloudwego/eino/components/retriever" |
||||
"github.com/cloudwego/eino/schema" |
||||
|
||||
"github.com/coze-dev/coze-studio/backend/infra/impl/oceanbase" |
||||
"github.com/coze-dev/coze-studio/backend/pkg/lang/ptr" |
||||
"github.com/coze-dev/coze-studio/backend/pkg/logs" |
||||
) |
||||
|
||||
type oceanbaseSearchStore struct { |
||||
manager *oceanbaseManager |
||||
collectionName string |
||||
tableName string |
||||
} |
||||
|
||||
func (s *oceanbaseSearchStore) Store(ctx context.Context, docs []*schema.Document, opts ...indexer.Option) ([]string, error) { |
||||
if len(docs) == 0 { |
||||
return []string{}, nil |
||||
} |
||||
|
||||
startTime := time.Now() |
||||
defer func() { |
||||
logs.CtxInfof(ctx, "Store operation completed in %v for %d documents", |
||||
time.Since(startTime), len(docs)) |
||||
}() |
||||
|
||||
var ids []string |
||||
var vectorDataList []*vectorData |
||||
|
||||
for _, doc := range docs { |
||||
content := ExtractContent(doc) |
||||
if content == "" { |
||||
logs.CtxWarnf(ctx, "Document %s has no content, skipping", doc.ID) |
||||
continue |
||||
} |
||||
|
||||
embeddings, err := s.manager.config.Embedding.EmbedStrings(ctx, []string{content}) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("[Store] failed to embed document: %w", err) |
||||
} |
||||
|
||||
if len(embeddings) == 0 { |
||||
logs.CtxWarnf(ctx, "Failed to generate embedding for document %s", doc.ID) |
||||
continue |
||||
} |
||||
|
||||
metadata := BuildMetadata(doc) |
||||
metadataJSON, err := MetadataToJSON(metadata) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("[Store] failed to marshal metadata: %w", err) |
||||
} |
||||
|
||||
vectorData := &vectorData{ |
||||
VectorID: doc.ID, |
||||
Content: content, |
||||
Metadata: metadataJSON, |
||||
Embedding: ConvertToFloat32(embeddings[0]), |
||||
} |
||||
|
||||
vectorDataList = append(vectorDataList, vectorData) |
||||
ids = append(ids, doc.ID) |
||||
} |
||||
|
||||
if len(vectorDataList) > 0 { |
||||
if err := s.batchInsertWithRetry(ctx, vectorDataList); err != nil { |
||||
return nil, fmt.Errorf("[Store] failed to batch insert vector data: %w", err) |
||||
} |
||||
} |
||||
|
||||
logs.CtxInfof(ctx, "Stored %d documents to OceanBase collection: %s", len(ids), s.collectionName) |
||||
return ids, nil |
||||
} |
||||
|
||||
func (s *oceanbaseSearchStore) Retrieve(ctx context.Context, query string, opts ...retriever.Option) ([]*schema.Document, error) { |
||||
startTime := time.Now() |
||||
defer func() { |
||||
logs.CtxInfof(ctx, "Retrieve operation completed in %v", time.Since(startTime)) |
||||
}() |
||||
|
||||
options := retriever.GetCommonOptions(&retriever.Options{TopK: ptr.Of(10)}, opts...) |
||||
|
||||
embeddings, err := s.manager.config.Embedding.EmbedStrings(ctx, []string{query}) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("[Retrieve] failed to embed query: %w", err) |
||||
} |
||||
|
||||
if len(embeddings) == 0 { |
||||
return nil, fmt.Errorf("[Retrieve] failed to generate embedding for query") |
||||
} |
||||
|
||||
results, err := s.manager.config.Client.SearchVectors( |
||||
ctx, |
||||
s.collectionName, |
||||
embeddings[0], |
||||
ptr.From(options.TopK), |
||||
0.1, |
||||
) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("[Retrieve] failed to search vectors: %w", err) |
||||
} |
||||
|
||||
logs.CtxInfof(ctx, "OceanBase returned %d results", len(results)) |
||||
|
||||
documents := make([]*schema.Document, 0, len(results)) |
||||
for _, result := range results { |
||||
metadata, err := JSONToMetadata(result.Metadata) |
||||
if err != nil { |
||||
logs.CtxWarnf(ctx, "Failed to parse metadata for result %s: %v", result.VectorID, err) |
||||
metadata = make(map[string]interface{}) |
||||
} |
||||
|
||||
doc := &schema.Document{ |
||||
ID: result.VectorID, |
||||
Content: result.Content, |
||||
MetaData: metadata, |
||||
} |
||||
|
||||
similarityScore := result.SimilarityScore |
||||
logs.CtxInfof(ctx, "Setting score for document %s: %f", result.VectorID, similarityScore) |
||||
doc.WithScore(similarityScore) |
||||
|
||||
documents = append(documents, doc) |
||||
} |
||||
|
||||
sort.Slice(documents, func(i, j int) bool { |
||||
return documents[i].Score() > documents[j].Score() |
||||
}) |
||||
|
||||
if len(documents) > 0 { |
||||
s.normalizeScores(documents) |
||||
} |
||||
|
||||
if len(documents) > ptr.From(options.TopK) { |
||||
documents = documents[:ptr.From(options.TopK)] |
||||
} |
||||
|
||||
logs.CtxInfof(ctx, "Retrieved %d documents from OceanBase collection: %s", len(documents), s.collectionName) |
||||
for i, doc := range documents { |
||||
logs.CtxInfof(ctx, "Document %d: ID=%s, Score=%.6f, Content=%s", |
||||
i+1, doc.ID, doc.Score(), doc.Content[:min(len(doc.Content), 50)]) |
||||
} |
||||
|
||||
return documents, nil |
||||
} |
||||
|
||||
func (s *oceanbaseSearchStore) Delete(ctx context.Context, ids []string) error { |
||||
if len(ids) == 0 { |
||||
return nil |
||||
} |
||||
|
||||
startTime := time.Now() |
||||
defer func() { |
||||
logs.CtxInfof(ctx, "Delete operation completed in %v for %d documents", |
||||
time.Since(startTime), len(ids)) |
||||
}() |
||||
|
||||
batchSize := s.manager.config.BatchSize |
||||
for i := 0; i < len(ids); i += batchSize { |
||||
end := i + batchSize |
||||
if end > len(ids) { |
||||
end = len(ids) |
||||
} |
||||
|
||||
batch := ids[i:end] |
||||
if err := s.deleteBatch(ctx, batch); err != nil { |
||||
return fmt.Errorf("[Delete] failed to delete batch: %w", err) |
||||
} |
||||
} |
||||
|
||||
logs.CtxInfof(ctx, "Deleted %d documents from OceanBase collection: %s", len(ids), s.collectionName) |
||||
return nil |
||||
} |
||||
|
||||
func (s *oceanbaseSearchStore) batchInsertWithRetry(ctx context.Context, data []*vectorData) error { |
||||
maxRetries := s.manager.config.MaxRetries |
||||
retryDelay := s.manager.config.RetryDelay |
||||
batchSize := s.manager.config.BatchSize |
||||
|
||||
for attempt := 1; attempt <= maxRetries; attempt++ { |
||||
err := s.batchInsert(ctx, data, batchSize) |
||||
if err == nil { |
||||
return nil |
||||
} else if attempt == maxRetries { |
||||
return fmt.Errorf("batch insert failed after %d attempts: %w", maxRetries, err) |
||||
} |
||||
|
||||
logs.CtxWarnf(ctx, "Batch insert attempt %d failed, retrying in %v: %v", attempt, retryDelay, err) |
||||
time.Sleep(retryDelay) |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (s *oceanbaseSearchStore) batchInsert(ctx context.Context, data []*vectorData, batchSize int) error { |
||||
var vectors []oceanbase.VectorResult |
||||
for _, item := range data { |
||||
embedding64 := make([]float64, len(item.Embedding)) |
||||
for i, v := range item.Embedding { |
||||
embedding64[i] = float64(v) |
||||
} |
||||
|
||||
var metadata map[string]interface{} |
||||
if item.Metadata != "" && item.Metadata != "{}" { |
||||
if err := json.Unmarshal([]byte(item.Metadata), &metadata); err != nil { |
||||
logs.CtxWarnf(ctx, "Failed to parse metadata for %s: %v", item.VectorID, err) |
||||
metadata = make(map[string]interface{}) |
||||
} |
||||
} else { |
||||
metadata = make(map[string]interface{}) |
||||
} |
||||
|
||||
metadataStr := "{}" |
||||
if len(metadata) > 0 { |
||||
if metadataBytes, err := json.Marshal(metadata); err == nil { |
||||
metadataStr = string(metadataBytes) |
||||
} |
||||
} |
||||
|
||||
vectors = append(vectors, oceanbase.VectorResult{ |
||||
VectorID: item.VectorID, |
||||
Content: item.Content, |
||||
Metadata: metadataStr, |
||||
Embedding: embedding64, |
||||
CreatedAt: time.Now(), |
||||
}) |
||||
} |
||||
|
||||
return s.manager.config.Client.InsertVectors(ctx, s.collectionName, vectors) |
||||
} |
||||
|
||||
func (s *oceanbaseSearchStore) searchVectorsWithRetry(ctx context.Context, queryEmbedding []float32, limit int, threshold float64) ([]*vectorResult, error) { |
||||
maxRetries := s.manager.config.MaxRetries |
||||
retryDelay := s.manager.config.RetryDelay |
||||
|
||||
for attempt := 1; attempt <= maxRetries; attempt++ { |
||||
results, err := s.searchVectors(ctx, queryEmbedding, limit, threshold) |
||||
if err == nil { |
||||
return results, nil |
||||
} |
||||
|
||||
if attempt == maxRetries { |
||||
return nil, fmt.Errorf("search vectors failed after %d attempts: %w", maxRetries, err) |
||||
} |
||||
|
||||
logs.CtxWarnf(ctx, "Search vectors attempt %d failed, retrying in %v: %v", attempt, retryDelay, err) |
||||
time.Sleep(retryDelay) |
||||
} |
||||
|
||||
return nil, nil |
||||
} |
||||
|
||||
func (s *oceanbaseSearchStore) searchVectors(ctx context.Context, queryEmbedding []float32, limit int, threshold float64) ([]*vectorResult, error) { |
||||
embedding64 := make([]float64, len(queryEmbedding)) |
||||
for i, v := range queryEmbedding { |
||||
embedding64[i] = float64(v) |
||||
} |
||||
|
||||
results, err := s.manager.config.Client.SearchVectors(ctx, s.collectionName, embedding64, limit, threshold) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to search vectors: %w", err) |
||||
} |
||||
|
||||
var vectorResults []*vectorResult |
||||
for _, result := range results { |
||||
metadataStr := result.Metadata |
||||
if metadataStr == "" { |
||||
metadataStr = "{}" |
||||
} |
||||
|
||||
vectorResults = append(vectorResults, &vectorResult{ |
||||
VectorID: result.VectorID, |
||||
Content: result.Content, |
||||
Metadata: metadataStr, |
||||
Distance: result.SimilarityScore, |
||||
}) |
||||
} |
||||
|
||||
return vectorResults, nil |
||||
} |
||||
|
||||
func (s *oceanbaseSearchStore) deleteBatch(ctx context.Context, ids []string) error { |
||||
if len(ids) == 0 { |
||||
return nil |
||||
} |
||||
|
||||
for _, id := range ids { |
||||
if err := s.manager.config.Client.DeleteVector(ctx, s.collectionName, id); err != nil { |
||||
return fmt.Errorf("failed to delete vector %s: %w", id, err) |
||||
} |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (s *oceanbaseSearchStore) normalizeScores(documents []*schema.Document) { |
||||
if len(documents) == 0 { |
||||
return |
||||
} |
||||
|
||||
logs.CtxInfof(context.Background(), "Normalizing scores for %d documents", len(documents)) |
||||
|
||||
|
||||
for i := range documents { |
||||
originalScore := documents[i].Score() |
||||
logs.CtxInfof(context.Background(), "Document %d original score: %f", i+1, originalScore) |
||||
|
||||
if originalScore < 0 { |
||||
documents[i].WithScore(0.0) |
||||
logs.CtxInfof(context.Background(), "Document %d score adjusted from %f to 0.0", i+1, originalScore) |
||||
} else if originalScore > 1 { |
||||
documents[i].WithScore(1.0) |
||||
logs.CtxInfof(context.Background(), "Document %d score adjusted from %f to 1.0", i+1, originalScore) |
||||
} else { |
||||
logs.CtxInfof(context.Background(), "Document %d score unchanged: %f", i+1, originalScore) |
||||
} |
||||
} |
||||
|
||||
logs.CtxInfof(context.Background(), "Score normalization completed") |
||||
} |
||||
|
||||
type vectorData struct { |
||||
VectorID string |
||||
Content string |
||||
Metadata string |
||||
Embedding []float32 |
||||
} |
||||
|
||||
type vectorResult struct { |
||||
ID int64 `json:"id"` |
||||
VectorID string `json:"vector_id"` |
||||
Content string `json:"content"` |
||||
Metadata string `json:"metadata"` |
||||
Distance float64 `json:"distance"` |
||||
} |
||||
|
||||
func min(a, b int) int { |
||||
if a < b { |
||||
return a |
||||
} |
||||
return b |
||||
} |
@ -0,0 +1,81 @@ |
||||
/* |
||||
* Copyright 2025 coze-dev Authors |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package oceanbase |
||||
|
||||
import ( |
||||
"context" |
||||
"os" |
||||
"strconv" |
||||
|
||||
"github.com/coze-dev/coze-studio/backend/infra/contract/document/searchstore" |
||||
"github.com/coze-dev/coze-studio/backend/infra/contract/embedding" |
||||
"github.com/coze-dev/coze-studio/backend/pkg/logs" |
||||
) |
||||
|
||||
func CreateOceanBaseVectorStore( |
||||
config Config, |
||||
embedding embedding.Embedder, |
||||
) (searchstore.Manager, error) { |
||||
factory := NewFactory(&config) |
||||
|
||||
manager, err := factory.CreateManager(context.Background(), embedding) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
logs.Infof("Successfully created OceanBase vector store with type: %s", searchstore.TypeVectorStore) |
||||
return manager, nil |
||||
} |
||||
|
||||
func CreateOceanBaseVectorStoreWithEnv( |
||||
embedding embedding.Embedder, |
||||
) (searchstore.Manager, error) { |
||||
config := Config{ |
||||
Host: getEnv("OCEANBASE_HOST", "localhost"), |
||||
Port: getEnvAsInt("OCEANBASE_PORT", 2881), |
||||
User: getEnv("OCEANBASE_USER", "root"), |
||||
Password: getEnv("OCEANBASE_PASSWORD", ""), |
||||
Database: getEnv("OCEANBASE_DATABASE", "test"), |
||||
} |
||||
|
||||
return CreateOceanBaseVectorStore(config, embedding) |
||||
} |
||||
|
||||
func getEnv(key, defaultValue string) string { |
||||
if value := os.Getenv(key); value != "" { |
||||
return value |
||||
} |
||||
return defaultValue |
||||
} |
||||
|
||||
func getEnvAsInt(key string, defaultValue int) int { |
||||
if value := os.Getenv(key); value != "" { |
||||
if intValue, err := strconv.Atoi(value); err == nil { |
||||
return intValue |
||||
} |
||||
} |
||||
return defaultValue |
||||
} |
||||
|
||||
func getEnvAsBool(key string, defaultValue bool) bool { |
||||
if value := os.Getenv(key); value != "" { |
||||
if boolValue, err := strconv.ParseBool(value); err == nil { |
||||
return boolValue |
||||
} |
||||
} |
||||
return defaultValue |
||||
} |
@ -0,0 +1,90 @@ |
||||
/* |
||||
* Copyright 2025 coze-dev Authors |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package oceanbase |
||||
|
||||
import ( |
||||
"context" |
||||
|
||||
"gorm.io/gorm" |
||||
) |
||||
|
||||
type OceanBaseClient struct { |
||||
official *OceanBaseOfficialClient |
||||
} |
||||
|
||||
func NewOceanBaseClient(dsn string) (*OceanBaseClient, error) { |
||||
official, err := NewOceanBaseOfficialClient(dsn) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
return &OceanBaseClient{official: official}, nil |
||||
} |
||||
|
||||
func (c *OceanBaseClient) CreateCollection(ctx context.Context, collectionName string, dimension int) error { |
||||
return c.official.CreateCollection(ctx, collectionName, dimension) |
||||
} |
||||
|
||||
func (c *OceanBaseClient) InsertVectors(ctx context.Context, collectionName string, vectors []VectorResult) error { |
||||
return c.official.InsertVectors(ctx, collectionName, vectors) |
||||
} |
||||
|
||||
func (c *OceanBaseClient) SearchVectors(ctx context.Context, collectionName string, queryVector []float64, topK int, threshold float64) ([]VectorResult, error) { |
||||
return c.official.SearchVectors(ctx, collectionName, queryVector, topK, threshold) |
||||
} |
||||
|
||||
func (c *OceanBaseClient) SearchVectorsWithStrategy(ctx context.Context, collectionName string, queryVector []float64, topK int, threshold float64, strategy SearchStrategy) ([]VectorResult, error) { |
||||
return c.official.SearchVectors(ctx, collectionName, queryVector, topK, threshold) |
||||
} |
||||
|
||||
func (c *OceanBaseClient) GetDB() *gorm.DB { |
||||
return c.official.GetDB() |
||||
} |
||||
|
||||
func (c *OceanBaseClient) DebugCollectionData(ctx context.Context, collectionName string) error { |
||||
return c.official.DebugCollectionData(ctx, collectionName) |
||||
} |
||||
|
||||
func (c *OceanBaseClient) BatchInsertVectors(ctx context.Context, collectionName string, vectors []VectorResult) error { |
||||
return c.official.InsertVectors(ctx, collectionName, vectors) |
||||
} |
||||
|
||||
func (c *OceanBaseClient) DeleteVector(ctx context.Context, collectionName string, vectorID string) error { |
||||
return c.official.GetDB().WithContext(ctx).Exec("DELETE FROM "+collectionName+" WHERE vector_id = ?", vectorID).Error |
||||
} |
||||
|
||||
func (c *OceanBaseClient) InitDatabase(ctx context.Context) error { |
||||
return c.official.GetDB().WithContext(ctx).Exec("SELECT 1").Error |
||||
} |
||||
|
||||
func (c *OceanBaseClient) DropCollection(ctx context.Context, collectionName string) error { |
||||
return c.official.GetDB().WithContext(ctx).Exec("DROP TABLE IF EXISTS " + collectionName).Error |
||||
} |
||||
|
||||
type SearchStrategy interface { |
||||
GetThreshold() float64 |
||||
} |
||||
|
||||
type DefaultSearchStrategy struct{} |
||||
|
||||
func NewDefaultSearchStrategy() *DefaultSearchStrategy { |
||||
return &DefaultSearchStrategy{} |
||||
} |
||||
|
||||
func (s *DefaultSearchStrategy) GetThreshold() float64 { |
||||
return 0.0 |
||||
} |
@ -0,0 +1,373 @@ |
||||
/* |
||||
* Copyright 2025 coze-dev Authors |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package oceanbase |
||||
|
||||
import ( |
||||
"context" |
||||
"fmt" |
||||
"log" |
||||
"sort" |
||||
"strings" |
||||
"time" |
||||
|
||||
"gorm.io/driver/mysql" |
||||
"gorm.io/gorm" |
||||
"gorm.io/gorm/logger" |
||||
) |
||||
|
||||
type OceanBaseOfficialClient struct { |
||||
db *gorm.DB |
||||
} |
||||
|
||||
type VectorResult struct { |
||||
VectorID string `json:"vector_id"` |
||||
Content string `json:"content"` |
||||
Metadata string `json:"metadata"` |
||||
Embedding []float64 `json:"embedding"` |
||||
SimilarityScore float64 `json:"similarity_score"` |
||||
Distance float64 `json:"distance"` |
||||
CreatedAt time.Time `json:"created_at"` |
||||
} |
||||
|
||||
type CollectionInfo struct { |
||||
Name string `json:"name"` |
||||
Dimension int `json:"dimension"` |
||||
IndexType string `json:"index_type"` |
||||
} |
||||
|
||||
func NewOceanBaseOfficialClient(dsn string) (*OceanBaseOfficialClient, error) { |
||||
db, err := gorm.Open(mysql.Open(dsn), &gorm.Config{ |
||||
Logger: logger.Default.LogMode(logger.Info), |
||||
}) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to connect to OceanBase: %v", err) |
||||
} |
||||
|
||||
client := &OceanBaseOfficialClient{db: db} |
||||
|
||||
if err := client.setVectorParameters(); err != nil { |
||||
log.Printf("Warning: Failed to set vector parameters: %v", err) |
||||
} |
||||
|
||||
return client, nil |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) setVectorParameters() error { |
||||
params := map[string]string{ |
||||
"ob_vector_memory_limit_percentage": "30", |
||||
"ob_query_timeout": "86400000000", |
||||
"max_allowed_packet": "1073741824", |
||||
} |
||||
|
||||
for param, value := range params { |
||||
if err := c.db.Exec(fmt.Sprintf("SET GLOBAL %s = %s", param, value)).Error; err != nil { |
||||
log.Printf("Warning: Failed to set %s: %v", param, err) |
||||
} |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) CreateCollection(ctx context.Context, collectionName string, dimension int) error { |
||||
createTableSQL := fmt.Sprintf(` |
||||
CREATE TABLE IF NOT EXISTS %s ( |
||||
vector_id VARCHAR(255) PRIMARY KEY, |
||||
content TEXT NOT NULL, |
||||
metadata JSON, |
||||
embedding VECTOR(%d) NOT NULL, |
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, |
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, |
||||
INDEX idx_created_at (created_at), |
||||
INDEX idx_content (content(100)) |
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci |
||||
`, collectionName, dimension) |
||||
|
||||
if err := c.db.WithContext(ctx).Exec(createTableSQL).Error; err != nil { |
||||
return fmt.Errorf("failed to create table: %v", err) |
||||
} |
||||
|
||||
createIndexSQL := fmt.Sprintf(` |
||||
CREATE VECTOR INDEX idx_%s_embedding ON %s(embedding) |
||||
WITH (distance=cosine, type=hnsw, lib=vsag, m=16, ef_construction=200, ef_search=64) |
||||
`, collectionName, collectionName) |
||||
|
||||
if err := c.db.WithContext(ctx).Exec(createIndexSQL).Error; err != nil { |
||||
log.Printf("Warning: Failed to create HNSW vector index, will use exact search: %v", err) |
||||
} |
||||
|
||||
log.Printf("Successfully created collection '%s' with dimension %d", collectionName, dimension) |
||||
return nil |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) InsertVectors(ctx context.Context, collectionName string, vectors []VectorResult) error { |
||||
if len(vectors) == 0 { |
||||
return nil |
||||
} |
||||
|
||||
const batchSize = 100 |
||||
for i := 0; i < len(vectors); i += batchSize { |
||||
end := i + batchSize |
||||
if end > len(vectors) { |
||||
end = len(vectors) |
||||
} |
||||
batch := vectors[i:end] |
||||
|
||||
if err := c.insertBatch(ctx, collectionName, batch); err != nil { |
||||
return fmt.Errorf("failed to insert vectors batch %d-%d: %v", i, end-1, err) |
||||
} |
||||
} |
||||
|
||||
log.Printf("Successfully inserted %d vectors into collection '%s'", len(vectors), collectionName) |
||||
return nil |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) insertBatch(ctx context.Context, collectionName string, batch []VectorResult) error { |
||||
placeholders := make([]string, len(batch)) |
||||
values := make([]interface{}, 0, len(batch)*5) |
||||
|
||||
for j, vector := range batch { |
||||
placeholders[j] = "(?, ?, ?, ?, NOW())" |
||||
values = append(values, |
||||
vector.VectorID, |
||||
vector.Content, |
||||
vector.Metadata, |
||||
c.vectorToString(vector.Embedding), |
||||
) |
||||
} |
||||
|
||||
sql := fmt.Sprintf(` |
||||
INSERT INTO %s (vector_id, content, metadata, embedding, created_at) |
||||
VALUES %s |
||||
ON DUPLICATE KEY UPDATE |
||||
content = VALUES(content), |
||||
metadata = VALUES(metadata), |
||||
embedding = VALUES(embedding), |
||||
updated_at = NOW() |
||||
`, collectionName, strings.Join(placeholders, ",")) |
||||
|
||||
return c.db.WithContext(ctx).Exec(sql, values...).Error |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) SearchVectors( |
||||
ctx context.Context, |
||||
collectionName string, |
||||
queryVector []float64, |
||||
topK int, |
||||
threshold float64, |
||||
) ([]VectorResult, error) { |
||||
|
||||
var count int64 |
||||
if err := c.db.WithContext(ctx).Table(collectionName).Count(&count).Error; err != nil { |
||||
return nil, fmt.Errorf("collection '%s' does not exist: %v", collectionName, err) |
||||
} |
||||
|
||||
if count == 0 { |
||||
log.Printf("Collection '%s' is empty", collectionName) |
||||
return []VectorResult{}, nil |
||||
} |
||||
|
||||
collectionInfo, err := c.getCollectionInfo(ctx, collectionName) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to get collection info: %v", err) |
||||
} |
||||
|
||||
log.Printf("[Debug] Collection info: name=%s, dimension=%d, index_type=%s", |
||||
collectionName, collectionInfo.Dimension, collectionInfo.IndexType) |
||||
|
||||
query, params, err := c.buildOptimizedSearchQuery(collectionName, queryVector, topK) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to build search query: %v", err) |
||||
} |
||||
|
||||
log.Printf("[Debug] Built optimized query: %s", query) |
||||
log.Printf("[Debug] Query params count: %d", len(params)) |
||||
|
||||
var results []VectorResult |
||||
rows, err := c.db.WithContext(ctx).Raw(query, params...).Rows() |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to execute search query: %v", err) |
||||
} |
||||
defer rows.Close() |
||||
|
||||
for rows.Next() { |
||||
var result VectorResult |
||||
var embeddingStr string |
||||
if err := rows.Scan( |
||||
&result.VectorID, |
||||
&result.Content, |
||||
&result.Metadata, |
||||
&embeddingStr, |
||||
&result.SimilarityScore, |
||||
&result.CreatedAt, |
||||
); err != nil { |
||||
return nil, fmt.Errorf("failed to scan result row: %v", err) |
||||
} |
||||
results = append(results, result) |
||||
} |
||||
|
||||
log.Printf("[Debug] Raw search results count: %d", len(results)) |
||||
|
||||
finalResults := c.postProcessResults(results, topK, threshold) |
||||
|
||||
log.Printf("[Debug] Final results count: %d", len(finalResults)) |
||||
return finalResults, nil |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) buildOptimizedSearchQuery( |
||||
collectionName string, |
||||
queryVector []float64, |
||||
topK int, |
||||
) (string, []interface{}, error) { |
||||
|
||||
queryVectorStr := c.vectorToString(queryVector) |
||||
|
||||
similarityExpr := "GREATEST(0, LEAST(1, 1 - COSINE_DISTANCE(embedding, ?)))" |
||||
orderBy := "COSINE_DISTANCE(embedding, ?) ASC" |
||||
|
||||
query := fmt.Sprintf(` |
||||
SELECT |
||||
vector_id, |
||||
content, |
||||
metadata, |
||||
embedding, |
||||
%s as similarity_score, |
||||
created_at |
||||
FROM %s |
||||
ORDER BY %s |
||||
APPROXIMATE |
||||
LIMIT %d |
||||
`, similarityExpr, collectionName, orderBy, topK*2) |
||||
|
||||
params := []interface{}{ |
||||
queryVectorStr, |
||||
queryVectorStr, |
||||
} |
||||
|
||||
return query, params, nil |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) getCollectionInfo(ctx context.Context, collectionName string) (*CollectionInfo, error) { |
||||
var dimension int |
||||
|
||||
dimQuery := ` |
||||
SELECT |
||||
SUBSTRING_INDEX(SUBSTRING_INDEX(COLUMN_TYPE, '(', -1), ')', 1) as dimension |
||||
FROM INFORMATION_SCHEMA.COLUMNS |
||||
WHERE TABLE_NAME = ? AND COLUMN_NAME = 'embedding' |
||||
` |
||||
|
||||
if err := c.db.WithContext(ctx).Raw(dimQuery, collectionName).Scan(&dimension).Error; err != nil { |
||||
return nil, fmt.Errorf("failed to get vector dimension: %v", err) |
||||
} |
||||
|
||||
var indexType string |
||||
indexQuery := ` |
||||
SELECT INDEX_TYPE |
||||
FROM INFORMATION_SCHEMA.STATISTICS |
||||
WHERE TABLE_NAME = ? AND INDEX_NAME LIKE 'idx_%_embedding' |
||||
` |
||||
|
||||
if err := c.db.WithContext(ctx).Raw(indexQuery, collectionName).Scan(&indexType).Error; err != nil { |
||||
indexType = "none" |
||||
} |
||||
|
||||
return &CollectionInfo{ |
||||
Name: collectionName, |
||||
Dimension: dimension, |
||||
IndexType: indexType, |
||||
}, nil |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) vectorToString(vector []float64) string { |
||||
if len(vector) == 0 { |
||||
return "[]" |
||||
} |
||||
|
||||
parts := make([]string, len(vector)) |
||||
for i, v := range vector { |
||||
parts[i] = fmt.Sprintf("%.6f", v) |
||||
} |
||||
return "[" + strings.Join(parts, ",") + "]" |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) postProcessResults(results []VectorResult, topK int, threshold float64) []VectorResult { |
||||
if len(results) == 0 { |
||||
return results |
||||
} |
||||
|
||||
filtered := make([]VectorResult, 0, len(results)) |
||||
for _, result := range results { |
||||
if result.SimilarityScore >= threshold { |
||||
filtered = append(filtered, result) |
||||
} |
||||
} |
||||
|
||||
sort.Slice(filtered, func(i, j int) bool { |
||||
return filtered[i].SimilarityScore > filtered[j].SimilarityScore |
||||
}) |
||||
|
||||
if len(filtered) > topK { |
||||
filtered = filtered[:topK] |
||||
} |
||||
|
||||
log.Printf("[Debug] Post-processed results: %d results with threshold %.3f", len(filtered), threshold) |
||||
return filtered |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) GetDB() *gorm.DB { |
||||
return c.db |
||||
} |
||||
|
||||
func (c *OceanBaseOfficialClient) DebugCollectionData(ctx context.Context, collectionName string) error { |
||||
var count int64 |
||||
if err := c.db.WithContext(ctx).Table(collectionName).Count(&count).Error; err != nil { |
||||
log.Printf("[Debug] Collection '%s' does not exist: %v", collectionName, err) |
||||
return err |
||||
} |
||||
log.Printf("[Debug] Collection '%s' exists with %d vectors", collectionName, count) |
||||
|
||||
log.Printf("[Debug] Sample data from collection '%s':", collectionName) |
||||
rows, err := c.db.WithContext(ctx).Raw(` |
||||
SELECT vector_id, content, created_at |
||||
FROM ` + collectionName + ` |
||||
ORDER BY created_at DESC |
||||
LIMIT 5 |
||||
`).Rows() |
||||
if err != nil { |
||||
log.Printf("[Debug] Failed to get sample data: %v", err) |
||||
} else { |
||||
defer rows.Close() |
||||
for rows.Next() { |
||||
var vectorID, content string |
||||
var createdAt time.Time |
||||
if err := rows.Scan(&vectorID, &content, &createdAt); err != nil { |
||||
log.Printf("[Debug] Failed to scan sample row: %v", err) |
||||
continue |
||||
} |
||||
log.Printf("[Debug] Sample: ID=%s, Content=%s, Created=%s", vectorID, content[:min(50, len(content))], createdAt) |
||||
} |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func min(a, b int) int { |
||||
if a < b { |
||||
return a |
||||
} |
||||
return b |
||||
} |
@ -0,0 +1,112 @@ |
||||
/* |
||||
* Copyright 2025 coze-dev Authors |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package oceanbase |
||||
|
||||
type VectorIndexConfig struct { |
||||
Distance string |
||||
//Index types: hnsw, hnsw_sq, hnsw_bq, ivf_flat, ivf_sq8, ivf_pq
|
||||
Type string |
||||
//Index library type: vsag, ob
|
||||
Lib string |
||||
// HNSW Index parameters
|
||||
M *int |
||||
EfConstruction *int |
||||
EfSearch *int |
||||
// IVF Index parameters
|
||||
Nlist *int |
||||
Nbits *int |
||||
IVFM *int |
||||
} |
||||
|
||||
type VectorData struct { |
||||
ID int64 `json:"id"` |
||||
CollectionName string `json:"collection_name"` |
||||
VectorID string `json:"vector_id"` |
||||
Content string `json:"content"` |
||||
Metadata map[string]interface{} `json:"metadata"` |
||||
Embedding []float64 `json:"embedding"` |
||||
} |
||||
|
||||
type VectorSearchResult struct { |
||||
ID int64 `json:"id"` |
||||
Content string `json:"content"` |
||||
Metadata string `json:"metadata"` |
||||
Distance float64 `json:"distance"` |
||||
} |
||||
|
||||
type VectorMemoryEstimate struct { |
||||
MinMemoryMB int `json:"min_memory_mb"` |
||||
RecommendedMemoryMB int `json:"recommended_memory_mb"` |
||||
EstimatedMemoryMB int `json:"estimated_memory_mb"` |
||||
} |
||||
|
||||
const ( |
||||
VectorIndexTypeHNSW = "hnsw" |
||||
VectorIndexTypeHNSWSQ = "hnsw_sq" |
||||
VectorIndexTypeHNSWBQ = "hnsw_bq" |
||||
VectorIndexTypeIVF = "ivf_flat" |
||||
VectorIndexTypeIVFSQ = "ivf_sq8" |
||||
VectorIndexTypeIVFPQ = "ivf_pq" |
||||
) |
||||
|
||||
const ( |
||||
VectorDistanceTypeL2 = "l2" |
||||
VectorDistanceTypeCosine = "cosine" |
||||
VectorDistanceTypeInnerProduct = "inner_product" |
||||
) |
||||
|
||||
const ( |
||||
VectorLibTypeVSAG = "vsag" |
||||
VectorLibTypeOB = "ob" |
||||
) |
||||
|
||||
func DefaultVectorIndexConfig() *VectorIndexConfig { |
||||
m := 16 |
||||
efConstruction := 200 |
||||
efSearch := 64 |
||||
|
||||
return &VectorIndexConfig{ |
||||
Distance: VectorDistanceTypeCosine, |
||||
Type: VectorIndexTypeHNSW, |
||||
Lib: VectorLibTypeVSAG, |
||||
M: &m, |
||||
EfConstruction: &efConstruction, |
||||
EfSearch: &efSearch, |
||||
} |
||||
} |
||||
|
||||
func HNSWVectorIndexConfig(distance string, m, efConstruction, efSearch int) *VectorIndexConfig { |
||||
return &VectorIndexConfig{ |
||||
Distance: distance, |
||||
Type: VectorIndexTypeHNSW, |
||||
Lib: VectorLibTypeVSAG, |
||||
M: &m, |
||||
EfConstruction: &efConstruction, |
||||
EfSearch: &efSearch, |
||||
} |
||||
} |
||||
|
||||
func IVFVectorIndexConfig(distance string, nlist, nbits, m int) *VectorIndexConfig { |
||||
return &VectorIndexConfig{ |
||||
Distance: distance, |
||||
Type: VectorIndexTypeIVF, |
||||
Lib: VectorLibTypeOB, |
||||
Nlist: &nlist, |
||||
Nbits: &nbits, |
||||
IVFM: &m, |
||||
} |
||||
} |
@ -0,0 +1,380 @@ |
||||
name: coze-studio |
||||
# Environment file will be specified via --env-file parameter |
||||
|
||||
services: |
||||
mysql: |
||||
image: mysql:8.4.5 |
||||
container_name: coze-mysql |
||||
restart: always |
||||
environment: |
||||
MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD:-root} |
||||
MYSQL_DATABASE: ${MYSQL_DATABASE:-opencoze} |
||||
MYSQL_USER: ${MYSQL_USER:-coze} |
||||
MYSQL_PASSWORD: ${MYSQL_PASSWORD:-coze123} |
||||
# ports: |
||||
# - '3306' |
||||
volumes: |
||||
- ./data/mysql:/var/lib/mysql |
||||
- ./volumes/mysql/schema.sql:/docker-entrypoint-initdb.d/init.sql |
||||
command: |
||||
- --character-set-server=utf8mb4 |
||||
- --collation-server=utf8mb4_unicode_ci |
||||
healthcheck: |
||||
test: |
||||
[ |
||||
'CMD', |
||||
'mysqladmin', |
||||
'ping', |
||||
'-h', |
||||
'localhost', |
||||
'-u$${MYSQL_USER}', |
||||
'-p$${MYSQL_PASSWORD}', |
||||
] |
||||
interval: 10s |
||||
timeout: 5s |
||||
retries: 5 |
||||
start_period: 30s |
||||
networks: |
||||
- coze-network |
||||
|
||||
redis: |
||||
image: bitnami/redis:8.0 |
||||
container_name: coze-redis |
||||
restart: always |
||||
user: root |
||||
privileged: true |
||||
environment: |
||||
- REDIS_AOF_ENABLED=${REDIS_AOF_ENABLED:-no} |
||||
- REDIS_PORT_NUMBER=${REDIS_PORT_NUMBER:-6379} |
||||
- REDIS_IO_THREADS=${REDIS_IO_THREADS:-4} |
||||
- ALLOW_EMPTY_PASSWORD=${ALLOW_EMPTY_PASSWORD:-yes} |
||||
# ports: |
||||
# - '6379' |
||||
volumes: |
||||
- ./data/bitnami/redis:/bitnami/redis/data:rw,Z |
||||
command: > |
||||
bash -c " |
||||
/opt/bitnami/scripts/redis/setup.sh |
||||
# Set proper permissions for data directories |
||||
chown -R redis:redis /bitnami/redis/data |
||||
chmod g+s /bitnami/redis/data |
||||
|
||||
exec /opt/bitnami/scripts/redis/entrypoint.sh /opt/bitnami/scripts/redis/run.sh |
||||
" |
||||
healthcheck: |
||||
test: ['CMD', 'redis-cli', 'ping'] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
networks: |
||||
- coze-network |
||||
elasticsearch: |
||||
image: bitnami/elasticsearch:8.18.0 |
||||
container_name: coze-elasticsearch |
||||
restart: always |
||||
user: root |
||||
privileged: true |
||||
environment: |
||||
- TEST=1 |
||||
# Add Java certificate trust configuration |
||||
# - ES_JAVA_OPTS=-Djdk.tls.client.protocols=TLSv1.2 -Dhttps.protocols=TLSv1.2 -Djavax.net.ssl.trustAll=true -Xms4096m -Xmx4096m |
||||
# ports: |
||||
# - '9200' |
||||
volumes: |
||||
- ./data/bitnami/elasticsearch:/bitnami/elasticsearch/data |
||||
- ./volumes/elasticsearch/elasticsearch.yml:/opt/bitnami/elasticsearch/config/my_elasticsearch.yml |
||||
- ./volumes/elasticsearch/analysis-smartcn.zip:/opt/bitnami/elasticsearch/analysis-smartcn.zip:rw,Z |
||||
- ./volumes/elasticsearch/setup_es.sh:/setup_es.sh |
||||
- ./volumes/elasticsearch/es_index_schema:/es_index_schema |
||||
healthcheck: |
||||
test: |
||||
[ |
||||
'CMD-SHELL', |
||||
'curl -f http://localhost:9200 && [ -f /tmp/es_plugins_ready ] && [ -f /tmp/es_init_complete ]', |
||||
] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
networks: |
||||
- coze-network |
||||
# Install smartcn analyzer plugin and initialize ES |
||||
command: > |
||||
bash -c " |
||||
/opt/bitnami/scripts/elasticsearch/setup.sh |
||||
# Set proper permissions for data directories |
||||
chown -R elasticsearch:elasticsearch /bitnami/elasticsearch/data |
||||
chmod g+s /bitnami/elasticsearch/data |
||||
|
||||
# Create plugin directory |
||||
mkdir -p /bitnami/elasticsearch/plugins; |
||||
|
||||
# Unzip plugin to plugin directory and set correct permissions |
||||
echo 'Installing smartcn plugin...'; |
||||
if [ ! -d /opt/bitnami/elasticsearch/plugins/analysis-smartcn ]; then |
||||
|
||||
# Download plugin package locally |
||||
echo 'Copying smartcn plugin...'; |
||||
cp /opt/bitnami/elasticsearch/analysis-smartcn.zip /tmp/analysis-smartcn.zip |
||||
|
||||
elasticsearch-plugin install file:///tmp/analysis-smartcn.zip |
||||
if [[ "$$?" != "0" ]]; then |
||||
echo 'Plugin installation failed, exiting operation'; |
||||
rm -rf /opt/bitnami/elasticsearch/plugins/analysis-smartcn |
||||
exit 1; |
||||
fi; |
||||
rm -f /tmp/analysis-smartcn.zip; |
||||
fi; |
||||
|
||||
# Create marker file indicating plugin installation success |
||||
touch /tmp/es_plugins_ready; |
||||
echo 'Plugin installation successful, marker file created'; |
||||
|
||||
# Start initialization script in background |
||||
( |
||||
echo 'Waiting for Elasticsearch to be ready...' |
||||
until curl -s -f http://localhost:9200/_cat/health >/dev/null 2>&1; do |
||||
echo 'Elasticsearch not ready, waiting...' |
||||
sleep 2 |
||||
done |
||||
echo 'Elasticsearch is ready!' |
||||
|
||||
# Run ES initialization script |
||||
echo 'Running Elasticsearch initialization...' |
||||
sed 's/\r$$//' /setup_es.sh > /setup_es_fixed.sh |
||||
chmod +x /setup_es_fixed.sh |
||||
/setup_es_fixed.sh --index-dir /es_index_schema |
||||
# Create marker file indicating initialization completion |
||||
touch /tmp/es_init_complete |
||||
echo 'Elasticsearch initialization completed successfully!' |
||||
) & |
||||
|
||||
# Start Elasticsearch |
||||
exec /opt/bitnami/scripts/elasticsearch/entrypoint.sh /opt/bitnami/scripts/elasticsearch/run.sh |
||||
echo -e "⏳ Adjusting Elasticsearch disk watermark settings..." |
||||
" |
||||
|
||||
minio: |
||||
image: minio/minio:RELEASE.2025-06-13T11-33-47Z-cpuv1 |
||||
container_name: coze-minio |
||||
user: root |
||||
privileged: true |
||||
restart: always |
||||
# ports: |
||||
# - '9000' |
||||
# - '9001' |
||||
volumes: |
||||
- ./data/minio:/data |
||||
- ./volumes/minio/default_icon/:/default_icon |
||||
- ./volumes/minio/official_plugin_icon/:/official_plugin_icon |
||||
environment: |
||||
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minioadmin} |
||||
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin123} |
||||
MINIO_DEFAULT_BUCKETS: ${STORAGE_BUCKET:-opencoze},${MINIO_DEFAULT_BUCKETS:-oceanbase} |
||||
entrypoint: |
||||
- /bin/sh |
||||
- -c |
||||
- | |
||||
# Run initialization in background |
||||
( |
||||
# Wait for MinIO to be ready |
||||
until (/usr/bin/mc alias set localminio http://localhost:9000 $${MINIO_ROOT_USER} $${MINIO_ROOT_PASSWORD}) do |
||||
echo "Waiting for MinIO to be ready..." |
||||
sleep 1 |
||||
done |
||||
|
||||
# Create bucket and copy files |
||||
/usr/bin/mc mb --ignore-existing localminio/$${STORAGE_BUCKET} |
||||
/usr/bin/mc cp --recursive /default_icon/ localminio/$${STORAGE_BUCKET}/default_icon/ |
||||
/usr/bin/mc cp --recursive /official_plugin_icon/ localminio/$${STORAGE_BUCKET}/official_plugin_icon/ |
||||
|
||||
echo "MinIO initialization complete." |
||||
) & |
||||
|
||||
# Start minio server in foreground |
||||
exec minio server /data --console-address ":9001" |
||||
healthcheck: |
||||
test: |
||||
[ |
||||
'CMD-SHELL', |
||||
'/usr/bin/mc alias set health_check http://localhost:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} && /usr/bin/mc ready health_check', |
||||
] |
||||
interval: 30s |
||||
timeout: 10s |
||||
retries: 3 |
||||
start_period: 30s |
||||
networks: |
||||
- coze-network |
||||
|
||||
etcd: |
||||
image: bitnami/etcd:3.5 |
||||
container_name: coze-etcd |
||||
user: root |
||||
restart: always |
||||
privileged: true |
||||
environment: |
||||
- ETCD_AUTO_COMPACTION_MODE=revision |
||||
- ETCD_AUTO_COMPACTION_RETENTION=1000 |
||||
- ETCD_QUOTA_BACKEND_BYTES=4294967296 |
||||
- ALLOW_NONE_AUTHENTICATION=yes |
||||
# ports: |
||||
# - '2379' |
||||
# - '2380' |
||||
volumes: |
||||
- ./data/bitnami/etcd:/bitnami/etcd:rw,Z |
||||
- ./volumes/etcd/etcd.conf.yml:/opt/bitnami/etcd/conf/etcd.conf.yml:ro,Z |
||||
command: > |
||||
bash -c " |
||||
/opt/bitnami/scripts/etcd/setup.sh |
||||
# Set proper permissions for data and config directories |
||||
chown -R etcd:etcd /bitnami/etcd |
||||
chmod g+s /bitnami/etcd |
||||
|
||||
exec /opt/bitnami/scripts/etcd/entrypoint.sh /opt/bitnami/scripts/etcd/run.sh |
||||
" |
||||
healthcheck: |
||||
test: ['CMD', 'etcdctl', 'endpoint', 'health'] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
networks: |
||||
- coze-network |
||||
|
||||
# OceanBase for vector storage |
||||
oceanbase: |
||||
image: oceanbase/oceanbase-ce:latest |
||||
container_name: coze-oceanbase |
||||
restart: always |
||||
environment: |
||||
MODE: SLIM |
||||
OB_DATAFILE_SIZE: 1G |
||||
OB_SYS_PASSWORD: ${OCEANBASE_PASSWORD:-coze123} |
||||
OB_TENANT_PASSWORD: ${OCEANBASE_PASSWORD:-coze123} |
||||
ports: |
||||
- '2881:2881' |
||||
volumes: |
||||
- ./data/oceanbase/ob:/root/ob |
||||
- ./data/oceanbase/cluster:/root/.obd/cluster |
||||
deploy: |
||||
resources: |
||||
limits: |
||||
memory: 4G |
||||
reservations: |
||||
memory: 2G |
||||
healthcheck: |
||||
test: |
||||
[ |
||||
'CMD-SHELL', |
||||
'obclient -h127.0.0.1 -P2881 -uroot@test -pcoze123 -e "SELECT 1;"', |
||||
] |
||||
interval: 10s |
||||
retries: 30 |
||||
start_period: 30s |
||||
timeout: 10s |
||||
networks: |
||||
- coze-network |
||||
nsqlookupd: |
||||
image: nsqio/nsq:v1.2.1 |
||||
container_name: coze-nsqlookupd |
||||
command: /nsqlookupd |
||||
restart: always |
||||
# ports: |
||||
# - '4160' |
||||
# - '4161' |
||||
networks: |
||||
- coze-network |
||||
healthcheck: |
||||
test: ['CMD-SHELL', 'nsqlookupd --version'] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
|
||||
nsqd: |
||||
image: nsqio/nsq:v1.2.1 |
||||
container_name: coze-nsqd |
||||
command: /nsqd --lookupd-tcp-address=nsqlookupd:4160 --broadcast-address=nsqd |
||||
restart: always |
||||
# ports: |
||||
# - '4150' |
||||
# - '4151' |
||||
depends_on: |
||||
nsqlookupd: |
||||
condition: service_healthy |
||||
networks: |
||||
- coze-network |
||||
healthcheck: |
||||
test: ['CMD-SHELL', '/nsqd --version'] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
|
||||
nsqadmin: |
||||
image: nsqio/nsq:v1.2.1 |
||||
container_name: coze-nsqadmin |
||||
command: /nsqadmin --lookupd-http-address=nsqlookupd:4161 |
||||
restart: always |
||||
# ports: |
||||
# - '4171' |
||||
depends_on: |
||||
nsqlookupd: |
||||
condition: service_healthy |
||||
networks: |
||||
- coze-network |
||||
|
||||
coze-server: |
||||
# build: |
||||
# context: ../ |
||||
# dockerfile: backend/Dockerfile |
||||
image: cozedev/coze-studio-server:latest |
||||
restart: always |
||||
container_name: coze-server |
||||
# environment: |
||||
# LISTEN_ADDR: 0.0.0.0:8888 |
||||
networks: |
||||
- coze-network |
||||
# ports: |
||||
# - '8888' |
||||
# - '8889' |
||||
volumes: |
||||
- .env:/app/.env |
||||
- ../backend/conf:/app/resources/conf |
||||
# - ../backend/static:/app/resources/static |
||||
depends_on: |
||||
mysql: |
||||
condition: service_healthy |
||||
redis: |
||||
condition: service_healthy |
||||
elasticsearch: |
||||
condition: service_healthy |
||||
minio: |
||||
condition: service_healthy |
||||
oceanbase: |
||||
condition: service_healthy |
||||
command: ['/app/opencoze'] |
||||
|
||||
coze-web: |
||||
# build: |
||||
# context: .. |
||||
# dockerfile: frontend/Dockerfile |
||||
image: cozedev/coze-studio-web:latest |
||||
container_name: coze-web |
||||
restart: always |
||||
ports: |
||||
- "${WEB_LISTEN_ADDR:-8888}:80" |
||||
# - "443:443" # SSL port (uncomment if using SSL) |
||||
volumes: |
||||
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro # Main nginx config |
||||
- ./nginx/conf.d/default.conf:/etc/nginx/conf.d/default.conf:ro # Proxy config |
||||
# - ./nginx/ssl:/etc/nginx/ssl:ro # SSL certificates (uncomment if using SSL) |
||||
depends_on: |
||||
- coze-server |
||||
networks: |
||||
- coze-network |
||||
|
||||
networks: |
||||
coze-network: |
||||
driver: bridge |
@ -0,0 +1,529 @@ |
||||
name: coze-studio-debug |
||||
|
||||
x-env-file: &env_file |
||||
- .env.debug |
||||
|
||||
services: |
||||
mysql: |
||||
image: mysql:8.4.5 |
||||
container_name: coze-mysql |
||||
environment: |
||||
MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD:-root} |
||||
MYSQL_DATABASE: ${MYSQL_DATABASE:-opencoze} |
||||
MYSQL_USER: ${MYSQL_USER:-coze} |
||||
MYSQL_PASSWORD: ${MYSQL_PASSWORD:-coze123} |
||||
profiles: ['middleware', 'mysql-setup', 'mysql'] |
||||
env_file: *env_file |
||||
ports: |
||||
- '3306:3306' |
||||
volumes: |
||||
- ./data/mysql:/var/lib/mysql |
||||
command: |
||||
- --character-set-server=utf8mb4 |
||||
- --collation-server=utf8mb4_unicode_ci |
||||
healthcheck: |
||||
test: |
||||
[ |
||||
'CMD', |
||||
'mysqladmin', |
||||
'ping', |
||||
'-h', |
||||
'localhost', |
||||
'-u$${MYSQL_USER}', |
||||
'-p$${MYSQL_PASSWORD}', |
||||
] |
||||
interval: 10s |
||||
timeout: 5s |
||||
retries: 5 |
||||
start_period: 30s |
||||
networks: |
||||
- coze-network |
||||
|
||||
redis: |
||||
image: bitnami/redis:8.0 |
||||
container_name: coze-redis |
||||
user: root |
||||
privileged: true |
||||
profiles: ['middleware'] |
||||
env_file: *env_file |
||||
environment: |
||||
- REDIS_AOF_ENABLED=${REDIS_AOF_ENABLED:-no} |
||||
- REDIS_PORT_NUMBER=${REDIS_PORT_NUMBER:-6379} |
||||
- REDIS_IO_THREADS=${REDIS_IO_THREADS:-4} |
||||
- ALLOW_EMPTY_PASSWORD=${ALLOW_EMPTY_PASSWORD:-yes} |
||||
ports: |
||||
- '6379:6379' |
||||
volumes: |
||||
- ./data/bitnami/redis:/bitnami/redis/data:rw,Z |
||||
command: > |
||||
bash -c " |
||||
/opt/bitnami/scripts/redis/setup.sh |
||||
# Set proper permissions for data directories |
||||
chown -R redis:redis /bitnami/redis/data |
||||
chmod g+s /bitnami/redis/data |
||||
|
||||
exec /opt/bitnami/scripts/redis/entrypoint.sh /opt/bitnami/scripts/redis/run.sh |
||||
" |
||||
depends_on: |
||||
minio-setup: |
||||
condition: service_completed_successfully |
||||
mysql-setup-schema: |
||||
condition: service_completed_successfully |
||||
mysql-setup-init-sql: |
||||
condition: service_completed_successfully |
||||
healthcheck: |
||||
test: ['CMD', 'redis-cli', 'ping'] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
networks: |
||||
- coze-network |
||||
|
||||
# rocketmq-namesrv: |
||||
# image: apache/rocketmq:5.3.2 |
||||
# container_name: coze-rocketmq-namesrv |
||||
# privileged: true |
||||
# user: root |
||||
# profiles: ['middleware'] |
||||
# env_file: *env_file |
||||
# ports: |
||||
# - '9876:9876' |
||||
# volumes: |
||||
# - ./data/rocketmq/namesrv/logs:/home/rocketmq/logs:rw,Z |
||||
# - ./data/rocketmq/namesrv/store:/home/rocketmq/store:rw,Z |
||||
# environment: |
||||
# - ALLOW_ANONYMOUS_LOGIN=yes |
||||
# command: > |
||||
# bash -c " |
||||
# # Set proper permissions for data directories |
||||
# mkdir -p /home/rocketmq/logs /home/rocketmq/store |
||||
# mkdir -p /home/rocketmq/logs/rocketmqlogs |
||||
# touch /home/rocketmq/logs/rocketmqlogs/tools.log |
||||
# touch /home/rocketmq/logs/rocketmqlogs/tools_default.log |
||||
|
||||
# chown -R rocketmq:rocketmq /home/rocketmq/logs /home/rocketmq/store |
||||
# chmod g+s /home/rocketmq/logs /home/rocketmq/store |
||||
|
||||
# echo 'Starting RocketMQ NameServer...' |
||||
# sh mqnamesrv |
||||
# " |
||||
# healthcheck: |
||||
# test: ['CMD', 'sh', 'mqadmin', 'clusterList', '-n', 'localhost:9876'] |
||||
# interval: 5s |
||||
# timeout: 10s |
||||
# retries: 10 |
||||
# start_period: 10s |
||||
# networks: |
||||
# - coze-network |
||||
# rocketmq-broker: |
||||
# image: apache/rocketmq:5.3.2 |
||||
# container_name: coze-rocketmq-broker |
||||
# privileged: true |
||||
# user: root |
||||
# profiles: ['middleware'] |
||||
# env_file: *env_file |
||||
# ports: |
||||
# - '10909:10909' |
||||
# - '10911:10911' |
||||
# - '10912:10912' |
||||
# volumes: |
||||
# - ./data/rocketmq/broker/logs:/home/rocketmq/logs:rw,Z |
||||
# - ./data/rocketmq/broker/store:/home/rocketmq/store:rw,Z |
||||
# - ./volumes/rocketmq/broker.conf:/home/rocketmq/conf/broker.conf:rw,Z |
||||
# networks: |
||||
# - coze-network |
||||
# command: > |
||||
# bash -c ' |
||||
# # Set proper permissions |
||||
# mkdir -p /home/rocketmq/logs/rocketmqlogs /home/rocketmq/store |
||||
# touch /home/rocketmq/logs/rocketmqlogs/tools.log \ |
||||
# /home/rocketmq/logs/rocketmqlogs/tools_default.log |
||||
# chown -R rocketmq:rocketmq /home/rocketmq/logs /home/rocketmq/store |
||||
# chmod g+s /home/rocketmq/logs /home/rocketmq/store |
||||
|
||||
# echo "Starting RocketMQ Broker..." |
||||
# sh mqbroker -n rocketmq-namesrv:9876 -c /home/rocketmq/conf/broker.conf & |
||||
|
||||
# echo "Waiting for Broker registration..." |
||||
# broker_ready=false |
||||
# for i in {1..60}; do |
||||
# if sh mqadmin clusterList -n rocketmq-namesrv:9876 \ |
||||
# | grep -q "DefaultCluster.*broker-a"; then |
||||
# echo "Registered." |
||||
# broker_ready=true |
||||
# break |
||||
# fi |
||||
# echo "Not ready, retry $$i/60..." |
||||
# sleep 1 |
||||
# done |
||||
|
||||
# if [ "$$broker_ready" = false ]; then |
||||
# echo "ERROR: registration timed out." |
||||
# exit 1 |
||||
# fi |
||||
|
||||
# echo "Creating topics..." |
||||
# for t in opencoze_knowledge opencoze_search_app opencoze_search_resource \ |
||||
# %RETRY%cg_knowledge %RETRY%cg_search_app %RETRY%cg_search_resource; do |
||||
# sh mqadmin updateTopic -n rocketmq-namesrv:9876 \ |
||||
# -c DefaultCluster -t "$$t" |
||||
# done |
||||
|
||||
# touch /tmp/rocketmq_ready |
||||
# echo "Broker started successfully." |
||||
# wait |
||||
# ' |
||||
# depends_on: |
||||
# - rocketmq-namesrv |
||||
# healthcheck: |
||||
# test: ['CMD-SHELL', '[ -f /tmp/rocketmq_ready ]'] |
||||
# interval: 10s |
||||
# timeout: 10s |
||||
# retries: 10 |
||||
# start_period: 10s |
||||
|
||||
elasticsearch: |
||||
image: bitnami/elasticsearch:8.18.0 |
||||
container_name: coze-elasticsearch |
||||
user: root |
||||
privileged: true |
||||
profiles: ['middleware'] |
||||
env_file: *env_file |
||||
environment: |
||||
- TEST=1 |
||||
# Add Java certificate trust configuration |
||||
# - ES_JAVA_OPTS=-Djdk.tls.client.protocols=TLSv1.2 -Dhttps.protocols=TLSv1.2 -Djavax.net.ssl.trustAll=true -Xms4096m -Xmx4096m |
||||
ports: |
||||
- '9200:9200' |
||||
volumes: |
||||
- ./data/bitnami/elasticsearch:/bitnami/elasticsearch/data |
||||
- ./volumes/elasticsearch/elasticsearch.yml:/opt/bitnami/elasticsearch/config/my_elasticsearch.yml |
||||
- ./volumes/elasticsearch/analysis-smartcn.zip:/opt/bitnami/elasticsearch/analysis-smartcn.zip:rw,Z |
||||
- ./volumes/elasticsearch/setup_es.sh:/setup_es.sh |
||||
- ./volumes/elasticsearch/es_index_schema:/es_index_schemas |
||||
healthcheck: |
||||
test: |
||||
[ |
||||
'CMD-SHELL', |
||||
'curl -f http://localhost:9200 && [ -f /tmp/es_plugins_ready ] && [ -f /tmp/es_init_complete ]', |
||||
] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
networks: |
||||
- coze-network |
||||
# Install smartcn analyzer plugin and initialize ES |
||||
command: > |
||||
bash -c " |
||||
/opt/bitnami/scripts/elasticsearch/setup.sh |
||||
# Set proper permissions for data directories |
||||
chown -R elasticsearch:elasticsearch /bitnami/elasticsearch/data |
||||
chmod g+s /bitnami/elasticsearch/data |
||||
|
||||
# Create plugin directory |
||||
mkdir -p /bitnami/elasticsearch/plugins; |
||||
|
||||
# Unzip plugin to plugin directory and set correct permissions |
||||
echo 'Installing smartcn plugin...'; |
||||
if [ ! -d /opt/bitnami/elasticsearch/plugins/analysis-smartcn ]; then |
||||
|
||||
# Download plugin package locally |
||||
echo 'Copying smartcn plugin...'; |
||||
cp /opt/bitnami/elasticsearch/analysis-smartcn.zip /tmp/analysis-smartcn.zip |
||||
|
||||
elasticsearch-plugin install file:///tmp/analysis-smartcn.zip |
||||
if [[ "$$?" != "0" ]]; then |
||||
echo 'Plugin installation failed, exiting operation'; |
||||
rm -rf /opt/bitnami/elasticsearch/plugins/analysis-smartcn |
||||
exit 1; |
||||
fi; |
||||
rm -f /tmp/analysis-smartcn.zip; |
||||
fi; |
||||
|
||||
# Create marker file indicating plugin installation success |
||||
touch /tmp/es_plugins_ready; |
||||
echo 'Plugin installation successful, marker file created'; |
||||
|
||||
# Start initialization script in background |
||||
( |
||||
echo 'Waiting for Elasticsearch to be ready...' |
||||
until curl -s -f http://localhost:9200/_cat/health >/dev/null 2>&1; do |
||||
echo 'Elasticsearch not ready, waiting...' |
||||
sleep 2 |
||||
done |
||||
echo 'Elasticsearch is ready!' |
||||
|
||||
# Run ES initialization script |
||||
echo 'Running Elasticsearch initialization...' |
||||
sed 's/\r$$//' /setup_es.sh > /setup_es_fixed.sh |
||||
chmod +x /setup_es_fixed.sh |
||||
/setup_es_fixed.sh --index-dir /es_index_schemas |
||||
# Create marker file indicating initialization completion |
||||
touch /tmp/es_init_complete |
||||
echo 'Elasticsearch initialization completed successfully!' |
||||
) & |
||||
|
||||
# Start Elasticsearch |
||||
exec /opt/bitnami/scripts/elasticsearch/entrypoint.sh /opt/bitnami/scripts/elasticsearch/run.sh |
||||
echo -e "⏳ Adjusting Elasticsearch disk watermark settings..." |
||||
" |
||||
|
||||
minio: |
||||
image: minio/minio:RELEASE.2025-06-13T11-33-47Z-cpuv1 |
||||
container_name: coze-minio |
||||
user: root |
||||
privileged: true |
||||
profiles: ['middleware'] |
||||
env_file: *env_file |
||||
ports: |
||||
- '9000:9000' |
||||
- '9001:9001' |
||||
volumes: |
||||
- ./data/minio:/data |
||||
environment: |
||||
MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minioadmin} |
||||
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin123} |
||||
MINIO_DEFAULT_BUCKETS: ${MINIO_BUCKET:-opencoze},${MINIO_DEFAULT_BUCKETS:-oceanbase} |
||||
command: server /data --console-address ":9001" |
||||
healthcheck: |
||||
test: |
||||
[ |
||||
'CMD-SHELL', |
||||
'/usr/bin/mc alias set health_check http://localhost:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} && /usr/bin/mc ready health_check', |
||||
] |
||||
interval: 30s |
||||
timeout: 10s |
||||
retries: 3 |
||||
start_period: 30s |
||||
networks: |
||||
- coze-network |
||||
|
||||
etcd: |
||||
image: bitnami/etcd:3.5 |
||||
container_name: coze-etcd |
||||
user: root |
||||
privileged: true |
||||
profiles: ['middleware'] |
||||
env_file: *env_file |
||||
environment: |
||||
- ETCD_AUTO_COMPACTION_MODE=revision |
||||
- ETCD_AUTO_COMPACTION_RETENTION=1000 |
||||
- ETCD_QUOTA_BACKEND_BYTES=4294967296 |
||||
- ALLOW_NONE_AUTHENTICATION=yes |
||||
ports: |
||||
- 2379:2379 |
||||
- 2380:2380 |
||||
volumes: |
||||
- ./data/bitnami/etcd:/bitnami/etcd:rw,Z |
||||
- ./volumes/etcd/etcd.conf.yml:/opt/bitnami/etcd/conf/etcd.conf.yml:ro,Z |
||||
command: > |
||||
bash -c " |
||||
/opt/bitnami/scripts/etcd/setup.sh |
||||
# Set proper permissions for data and config directories |
||||
chown -R etcd:etcd /bitnami/etcd |
||||
chmod g+s /bitnami/etcd |
||||
|
||||
exec /opt/bitnami/scripts/etcd/entrypoint.sh /opt/bitnami/scripts/etcd/run.sh |
||||
" |
||||
healthcheck: |
||||
test: ['CMD', 'etcdctl', 'endpoint', 'health'] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
networks: |
||||
- coze-network |
||||
|
||||
# OceanBase for vector storage |
||||
oceanbase: |
||||
image: oceanbase/oceanbase-ce:latest |
||||
container_name: coze-oceanbase |
||||
environment: |
||||
MODE: SLIM |
||||
OB_DATAFILE_SIZE: 1G |
||||
OB_SYS_PASSWORD: ${OCEANBASE_PASSWORD:-coze123} |
||||
OB_TENANT_PASSWORD: ${OCEANBASE_PASSWORD:-coze123} |
||||
profiles: ['middleware'] |
||||
env_file: *env_file |
||||
ports: |
||||
- '2881:2881' |
||||
volumes: |
||||
- ./data/oceanbase/ob:/root/ob |
||||
- ./data/oceanbase/cluster:/root/.obd/cluster |
||||
deploy: |
||||
resources: |
||||
limits: |
||||
memory: 4G |
||||
reservations: |
||||
memory: 2G |
||||
healthcheck: |
||||
test: |
||||
[ |
||||
'CMD-SHELL', |
||||
'obclient -h127.0.0.1 -P2881 -uroot@test -pcoze123 -e "SELECT 1;"', |
||||
] |
||||
interval: 10s |
||||
retries: 30 |
||||
start_period: 30s |
||||
timeout: 10s |
||||
networks: |
||||
- coze-network |
||||
|
||||
nsqlookupd: |
||||
image: nsqio/nsq:v1.2.1 |
||||
container_name: coze-nsqlookupd |
||||
command: /nsqlookupd |
||||
profiles: ['middleware'] |
||||
ports: |
||||
- '4160:4160' |
||||
- '4161:4161' |
||||
networks: |
||||
- coze-network |
||||
healthcheck: |
||||
test: ['CMD-SHELL', 'nsqlookupd --version'] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
|
||||
nsqd: |
||||
image: nsqio/nsq:v1.2.1 |
||||
container_name: coze-nsqd |
||||
command: /nsqd --lookupd-tcp-address=coze-nsqlookupd:4160 --broadcast-address=coze-nsqd |
||||
profiles: ['middleware'] |
||||
ports: |
||||
- '4150:4150' |
||||
- '4151:4151' |
||||
depends_on: |
||||
nsqlookupd: |
||||
condition: service_healthy |
||||
networks: |
||||
- coze-network |
||||
healthcheck: |
||||
test: ['CMD-SHELL', '/nsqd --version'] |
||||
interval: 5s |
||||
timeout: 10s |
||||
retries: 10 |
||||
start_period: 10s |
||||
|
||||
nsqadmin: |
||||
image: nsqio/nsq:v1.2.1 |
||||
container_name: coze-nsqadmin |
||||
command: /nsqadmin --lookupd-http-address=coze-nsqlookupd:4161 |
||||
profiles: ['middleware'] |
||||
ports: |
||||
- '4171:4171' |
||||
depends_on: |
||||
nsqlookupd: |
||||
condition: service_healthy |
||||
networks: |
||||
- coze-network |
||||
|
||||
minio-setup: |
||||
image: minio/mc:RELEASE.2025-05-21T01-59-54Z-cpuv1 |
||||
container_name: coze-minio-setup |
||||
profiles: ['middleware'] |
||||
env_file: *env_file |
||||
depends_on: |
||||
minio: |
||||
condition: service_healthy |
||||
volumes: |
||||
- ./volumes/minio/default_icon/:/default_icon |
||||
- ./volumes/minio/official_plugin_icon/:/official_plugin_icon |
||||
entrypoint: > |
||||
/bin/sh -c " |
||||
(/usr/bin/mc alias set localminio http://coze-minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} && \ |
||||
/usr/bin/mc mb --ignore-existing localminio/${STORAGE_BUCKET} && \ |
||||
/usr/bin/mc cp --recursive /default_icon/ localminio/${STORAGE_BUCKET}/default_icon/ && \ |
||||
/usr/bin/mc cp --recursive /official_plugin_icon/ localminio/${STORAGE_BUCKET}/official_plugin_icon/ && \ |
||||
echo 'upload files to minio complete: Files uploaded to ${STORAGE_BUCKET} bucket.') || exit 1; \ |
||||
" |
||||
networks: |
||||
- coze-network |
||||
restart: 'no' |
||||
|
||||
mysql-setup-schema: |
||||
image: arigaio/atlas:0.35.0-community-alpine |
||||
container_name: coze-mysql-setup-schema |
||||
profiles: ['middleware', 'mysql-setup', 'run-server'] |
||||
env_file: *env_file |
||||
depends_on: |
||||
mysql: |
||||
condition: service_healthy |
||||
volumes: |
||||
- ./atlas/opencoze_latest_schema.hcl:/opencoze_latest_schema.hcl |
||||
entrypoint: |
||||
- /bin/sh |
||||
- -c |
||||
- | |
||||
set -ex |
||||
TMP_ATLAS_URL="${ATLAS_URL}" |
||||
if [ "${MYSQL_HOST}" = "localhost" ] || [ "${MYSQL_HOST}" = "127.0.0.1" ]; then |
||||
echo "MYSQL_HOST is localhost or 127.0.0.1, replacing with docker network address" |
||||
TMP_ATLAS_URL="mysql://${MYSQL_USER}:${MYSQL_PASSWORD}@mysql:${MYSQL_PORT}/${MYSQL_DATABASE}?charset=utf8mb4&parseTime=True" |
||||
fi |
||||
|
||||
echo "final atlas url: $${TMP_ATLAS_URL}" |
||||
for i in `seq 1 60`; do |
||||
if atlas schema apply \ |
||||
-u "$${TMP_ATLAS_URL}" \ |
||||
--to file:///opencoze_latest_schema.hcl \ |
||||
--exclude "atlas_schema_revisions,table_*" \ |
||||
--auto-approve; then |
||||
echo "MySQL setup complete." |
||||
exit 0 |
||||
fi |
||||
echo "atlas schema apply failed, retrying...($$i/60)" |
||||
sleep 1 |
||||
done |
||||
echo "MySQL setup failed after 60 retries." |
||||
exit 1 |
||||
networks: |
||||
- coze-network |
||||
mysql-setup-init-sql: |
||||
image: mysql:8.4.5 |
||||
container_name: coze-mysql-setup-init-sql |
||||
profiles: ['middleware', 'mysql-setup', 'run-server', 'volcano-setup'] |
||||
env_file: *env_file |
||||
depends_on: |
||||
mysql: |
||||
condition: service_healthy |
||||
command: |
||||
- /bin/sh |
||||
- -c |
||||
- | |
||||
set -ex |
||||
for i in $$(seq 1 60); do |
||||
DB_HOST="$${MYSQL_HOST}" |
||||
if [ "$${MYSQL_HOST}" = "localhost" ] || [ "$${MYSQL_HOST}" = "127.0.0.1" ]; then |
||||
DB_HOST="mysql" |
||||
fi |
||||
if mysql -h "$${DB_HOST}" -P"$${MYSQL_PORT}" -u"$${MYSQL_USER}" -p"$${MYSQL_PASSWORD}" "$${MYSQL_DATABASE}" < /schema.sql && \ |
||||
mysql -h "$${DB_HOST}" -P"$${MYSQL_PORT}" -u"$${MYSQL_USER}" -p"$${MYSQL_PASSWORD}" "$${MYSQL_DATABASE}" < /sql_init.sql; then |
||||
echo 'MySQL init success.' |
||||
exit 0 |
||||
fi |
||||
echo "Retrying to connect to mysql... ($$i/60)" |
||||
sleep 1 |
||||
done |
||||
echo 'Failed to init mysql db.' |
||||
exit 1 |
||||
volumes: |
||||
- ./volumes/mysql/sql_init.sql:/sql_init.sql |
||||
- ./volumes/mysql/schema.sql:/schema.sql |
||||
networks: |
||||
- coze-network |
||||
restart: 'no' |
||||
|
||||
coze-server: |
||||
build: |
||||
context: ../ |
||||
dockerfile: backend/Dockerfile |
||||
image: opencoze/opencoze:latest |
||||
profiles: ['build-server'] |
||||
|
||||
networks: |
||||
coze-network: |
||||
driver: bridge |
@ -0,0 +1,362 @@ |
||||
# OceanBase Vector Database Integration Guide |
||||
|
||||
## Overview |
||||
|
||||
This document provides a comprehensive guide to the integration of OceanBase vector database in Coze Studio, including architectural design, implementation details, configuration instructions, and usage guidelines. |
||||
|
||||
## Integration Background |
||||
|
||||
### Why Choose OceanBase? |
||||
|
||||
1. **Transaction Support**: OceanBase provides complete ACID transaction support, ensuring data consistency |
||||
2. **Simple Deployment**: Compared to specialized vector databases like Milvus, OceanBase deployment is simpler |
||||
3. **MySQL Compatibility**: Compatible with MySQL protocol, low learning curve |
||||
4. **Vector Extensions**: Native support for vector data types and indexing |
||||
5. **Operations Friendly**: Low operational costs, suitable for small to medium-scale applications |
||||
|
||||
### Comparison with Milvus |
||||
|
||||
| Feature | OceanBase | Milvus | |
||||
| ------------------------------- | -------------------- | --------------------------- | |
||||
| **Deployment Complexity** | Low (Single Machine) | High (Requires etcd, MinIO) | |
||||
| **Transaction Support** | Full ACID | Limited | |
||||
| **Vector Search Speed** | Medium | Faster | |
||||
| **Storage Efficiency** | Medium | Higher | |
||||
| **Operational Cost** | Low | High | |
||||
| **Learning Curve** | Gentle | Steep | |
||||
|
||||
## Architectural Design |
||||
|
||||
### Overall Architecture |
||||
|
||||
``` |
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ |
||||
│ Coze Studio │ │ OceanBase │ │ Vector Store │ |
||||
│ Application │───▶│ Client │───▶│ Manager │ |
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘ |
||||
│ |
||||
▼ |
||||
┌─────────────────┐ |
||||
│ OceanBase │ |
||||
│ Database │ |
||||
└─────────────────┘ |
||||
``` |
||||
|
||||
### Core Components |
||||
|
||||
#### 1. OceanBase Client (`backend/infra/impl/oceanbase/`) |
||||
|
||||
**Main Files**: |
||||
|
||||
- `oceanbase.go` - Delegation client, providing backward-compatible interface |
||||
- `oceanbase_official.go` - Core implementation, based on official documentation |
||||
- `types.go` - Type definitions |
||||
|
||||
**Core Functions**: |
||||
|
||||
```go |
||||
type OceanBaseClient interface { |
||||
CreateCollection(ctx context.Context, collectionName string) error |
||||
InsertVectors(ctx context.Context, collectionName string, vectors []VectorResult) error |
||||
SearchVectors(ctx context.Context, collectionName string, queryVector []float64, topK int) ([]VectorResult, error) |
||||
DeleteVector(ctx context.Context, collectionName string, vectorID string) error |
||||
InitDatabase(ctx context.Context) error |
||||
DropCollection(ctx context.Context, collectionName string) error |
||||
} |
||||
``` |
||||
|
||||
#### 2. Search Store Manager (`backend/infra/impl/document/searchstore/oceanbase/`) |
||||
|
||||
**Main Files**: |
||||
|
||||
- `oceanbase_manager.go` - Manager implementation |
||||
- `oceanbase_searchstore.go` - Search store implementation |
||||
- `factory.go` - Factory pattern creation |
||||
- `consts.go` - Constant definitions |
||||
- `convert.go` - Data conversion |
||||
- `register.go` - Registration functions |
||||
|
||||
**Core Functions**: |
||||
|
||||
```go |
||||
type Manager interface { |
||||
Create(ctx context.Context, collectionName string) (SearchStore, error) |
||||
Get(ctx context.Context, collectionName string) (SearchStore, error) |
||||
Delete(ctx context.Context, collectionName string) error |
||||
} |
||||
``` |
||||
|
||||
#### 3. Application Layer Integration (`backend/application/base/appinfra/`) |
||||
|
||||
**File**: `app_infra.go` |
||||
|
||||
**Integration Point**: |
||||
|
||||
```go |
||||
case "oceanbase": |
||||
// Build DSN |
||||
dsn := fmt.Sprintf("%s:%s@tcp(%s:%s)/%s?charset=utf8mb4&parseTime=True&loc=Local", |
||||
user, password, host, port, database) |
||||
|
||||
// Create client |
||||
client, err := oceanbaseClient.NewOceanBaseClient(dsn) |
||||
|
||||
// Initialize database |
||||
if err := client.InitDatabase(ctx); err != nil { |
||||
return nil, fmt.Errorf("init oceanbase database failed, err=%w", err) |
||||
} |
||||
``` |
||||
|
||||
## Configuration Instructions |
||||
|
||||
### Environment Variable Configuration |
||||
|
||||
#### Required Configuration |
||||
|
||||
```bash |
||||
# Vector store type |
||||
VECTOR_STORE_TYPE=oceanbase |
||||
|
||||
# OceanBase connection configuration |
||||
OCEANBASE_HOST=localhost |
||||
OCEANBASE_PORT=2881 |
||||
OCEANBASE_USER=root |
||||
OCEANBASE_PASSWORD=coze123 |
||||
OCEANBASE_DATABASE=test |
||||
``` |
||||
|
||||
#### Optional Configuration |
||||
|
||||
```bash |
||||
# Performance optimization configuration |
||||
OCEANBASE_VECTOR_MEMORY_LIMIT_PERCENTAGE=30 |
||||
OCEANBASE_BATCH_SIZE=100 |
||||
OCEANBASE_MAX_OPEN_CONNS=100 |
||||
OCEANBASE_MAX_IDLE_CONNS=10 |
||||
|
||||
# Cache configuration |
||||
OCEANBASE_ENABLE_CACHE=true |
||||
OCEANBASE_CACHE_TTL=300 |
||||
|
||||
# Monitoring configuration |
||||
OCEANBASE_ENABLE_METRICS=true |
||||
OCEANBASE_ENABLE_SLOW_QUERY_LOG=true |
||||
|
||||
# Retry configuration |
||||
OCEANBASE_MAX_RETRIES=3 |
||||
OCEANBASE_RETRY_DELAY=1 |
||||
OCEANBASE_CONN_TIMEOUT=30 |
||||
``` |
||||
|
||||
### Docker Configuration |
||||
|
||||
#### docker-compose-oceanbase.yml |
||||
|
||||
```yaml |
||||
oceanbase: |
||||
image: oceanbase/oceanbase-ce:latest |
||||
container_name: coze-oceanbase |
||||
environment: |
||||
MODE: SLIM |
||||
OB_DATAFILE_SIZE: 1G |
||||
OB_SYS_PASSWORD: ${OCEANBASE_PASSWORD:-coze123} |
||||
OB_TENANT_PASSWORD: ${OCEANBASE_PASSWORD:-coze123} |
||||
ports: |
||||
- '2881:2881' |
||||
volumes: |
||||
- ./data/oceanbase/ob:/root/ob |
||||
- ./data/oceanbase/cluster:/root/.obd/cluster |
||||
deploy: |
||||
resources: |
||||
limits: |
||||
memory: 4G |
||||
reservations: |
||||
memory: 2G |
||||
``` |
||||
|
||||
## Usage Guide |
||||
|
||||
### 1. Quick Start |
||||
|
||||
```bash |
||||
# Clone the project |
||||
git clone https://github.com/coze-dev/coze-studio.git |
||||
cd coze-studio |
||||
|
||||
# Setup OceanBase environment |
||||
make oceanbase_env |
||||
|
||||
# Start OceanBase debug environment |
||||
make oceanbase_debug |
||||
``` |
||||
|
||||
### 2. Verify Deployment |
||||
|
||||
```bash |
||||
# Check container status |
||||
docker ps | grep oceanbase |
||||
|
||||
# Test connection |
||||
mysql -h localhost -P 2881 -u root -p -e "SELECT 1;" |
||||
|
||||
# View databases |
||||
mysql -h localhost -P 2881 -u root -p -e "SHOW DATABASES;" |
||||
``` |
||||
|
||||
### 3. Create Knowledge Base |
||||
|
||||
In the Coze Studio interface: |
||||
|
||||
1. Enter knowledge base management |
||||
2. Select OceanBase as vector storage |
||||
3. Upload documents for vectorization |
||||
4. Test vector retrieval functionality |
||||
|
||||
### 4. Performance Monitoring |
||||
|
||||
```bash |
||||
# View container resource usage |
||||
docker stats coze-oceanbase |
||||
|
||||
# View slow query logs |
||||
docker logs coze-oceanbase | grep "slow query" |
||||
|
||||
# View connection count |
||||
mysql -h localhost -P 2881 -u root -p -e "SHOW PROCESSLIST;" |
||||
``` |
||||
|
||||
## Integration Features |
||||
|
||||
### 1. Design Principles |
||||
|
||||
#### Architecture Compatibility Design |
||||
|
||||
- Strictly follow Coze Studio core architectural design principles, ensuring seamless integration of OceanBase adaptation layer with existing systems |
||||
- Adopt delegation pattern (Delegation Pattern) to achieve backward compatibility, ensuring stability and consistency of existing interfaces |
||||
- Maintain complete compatibility with existing vector storage interfaces, ensuring smooth system migration and upgrade |
||||
|
||||
#### Performance First |
||||
|
||||
- Use HNSW index to achieve efficient approximate nearest neighbor search |
||||
- Batch operations reduce database interaction frequency |
||||
- Connection pool management optimizes resource usage |
||||
|
||||
#### Easy Deployment |
||||
|
||||
- Single machine deployment, no complex cluster configuration required |
||||
- Docker one-click deployment |
||||
- Environment variable configuration, flexible and easy to use |
||||
|
||||
### 2. Technical Highlights |
||||
|
||||
#### Delegation Pattern Design |
||||
|
||||
```go |
||||
type OceanBaseClient struct { |
||||
official *OceanBaseOfficialClient |
||||
} |
||||
|
||||
func (c *OceanBaseClient) CreateCollection(ctx context.Context, collectionName string) error { |
||||
return c.official.CreateCollection(ctx, collectionName) |
||||
} |
||||
``` |
||||
|
||||
#### Intelligent Configuration Management |
||||
|
||||
```go |
||||
func DefaultConfig() *Config { |
||||
return &Config{ |
||||
Host: getEnv("OCEANBASE_HOST", "localhost"), |
||||
Port: getEnvAsInt("OCEANBASE_PORT", 2881), |
||||
User: getEnv("OCEANBASE_USER", "root"), |
||||
Password: getEnv("OCEANBASE_PASSWORD", ""), |
||||
Database: getEnv("OCEANBASE_DATABASE", "test"), |
||||
// ... other configurations |
||||
} |
||||
} |
||||
``` |
||||
|
||||
#### Error Handling Optimization |
||||
|
||||
```go |
||||
func (c *OceanBaseOfficialClient) setVectorParameters() error { |
||||
params := map[string]string{ |
||||
"ob_vector_memory_limit_percentage": "30", |
||||
"ob_query_timeout": "86400000000", |
||||
"max_allowed_packet": "1073741824", |
||||
} |
||||
|
||||
for param, value := range params { |
||||
if err := c.db.Exec(fmt.Sprintf("SET GLOBAL %s = %s", param, value)).Error; err != nil { |
||||
log.Printf("Warning: Failed to set %s: %v", param, err) |
||||
} |
||||
} |
||||
return nil |
||||
} |
||||
``` |
||||
|
||||
## Troubleshooting |
||||
|
||||
### 1. Common Issues |
||||
|
||||
#### Connection Issues |
||||
|
||||
```bash |
||||
# Check container status |
||||
docker ps | grep oceanbase |
||||
|
||||
# Check port mapping |
||||
docker port coze-oceanbase |
||||
|
||||
# Test connection |
||||
mysql -h localhost -P 2881 -u root -p -e "SELECT 1;" |
||||
``` |
||||
|
||||
#### Vector Index Issues |
||||
|
||||
```sql |
||||
-- Check index status |
||||
SHOW INDEX FROM test_vectors; |
||||
|
||||
-- Rebuild index |
||||
DROP INDEX idx_test_embedding ON test_vectors; |
||||
CREATE VECTOR INDEX idx_test_embedding ON test_vectors(embedding) |
||||
WITH (distance=cosine, type=hnsw, lib=vsag, m=16, ef_construction=200, ef_search=64); |
||||
``` |
||||
|
||||
#### Performance Issues |
||||
|
||||
```sql |
||||
-- Adjust memory limit |
||||
SET GLOBAL ob_vector_memory_limit_percentage = 50; |
||||
|
||||
-- View slow queries |
||||
SHOW VARIABLES LIKE 'slow_query_log'; |
||||
``` |
||||
|
||||
### 2. Log Analysis |
||||
|
||||
```bash |
||||
# View OceanBase logs |
||||
docker logs coze-oceanbase |
||||
|
||||
# View application logs |
||||
tail -f logs/coze-studio.log | grep -i "oceanbase\|vector" |
||||
``` |
||||
|
||||
## Summary |
||||
|
||||
The integration of OceanBase vector database in Coze Studio has achieved the following goals: |
||||
|
||||
1. **Complete Functionality**: Supports complete vector storage and retrieval functionality |
||||
2. **Good Performance**: Achieves efficient vector search through HNSW indexing |
||||
3. **Simple Deployment**: Single machine deployment, no complex configuration required |
||||
4. **Operations Friendly**: Low operational costs, easy monitoring and management |
||||
5. **Strong Scalability**: Supports horizontal and vertical scaling |
||||
|
||||
Through this integration, Coze Studio provides users with a simple, efficient, and reliable vector database solution, particularly suitable for scenarios requiring transaction support, simple deployment, and low operational costs. |
||||
|
||||
## Related Links |
||||
|
||||
- [OceanBase Official Documentation](https://www.oceanbase.com/docs) |
||||
- [Coze Studio Project Repository](https://github.com/coze-dev/coze-studio) |
@ -0,0 +1,364 @@ |
||||
# OceanBase 向量数据库集成指南 |
||||
|
||||
## 概述 |
||||
|
||||
本文档详细介绍了 OceanBase 向量数据库在 Coze Studio 中的集成适配情况,包括架构设计、实现细节、配置说明和使用指南。 |
||||
|
||||
## 集成背景 |
||||
|
||||
### 为什么选择 OceanBase? |
||||
|
||||
1. **事务支持**: OceanBase 提供完整的 ACID 事务支持,确保数据一致性 |
||||
2. **部署简单**: 相比 Milvus 等专用向量数据库,OceanBase 部署更简单 |
||||
3. **MySQL 兼容**: 兼容 MySQL 协议,学习成本低 |
||||
4. **向量扩展**: 原生支持向量数据类型和索引 |
||||
5. **运维友好**: 运维成本低,适合中小规模应用 |
||||
|
||||
### 与 Milvus 的对比 |
||||
|
||||
| 特性 | OceanBase | Milvus | |
||||
| ---------------------- | -------------- | ---------------------- | |
||||
| **部署复杂度** | 低(单机部署) | 高(需要 etcd、MinIO) | |
||||
| **事务支持** | 完整 ACID | 有限 | |
||||
| **向量检索速度** | 中等 | 更快 | |
||||
| **存储效率** | 中等 | 更高 | |
||||
| **运维成本** | 低 | 高 | |
||||
| **学习曲线** | 平缓 | 陡峭 | |
||||
|
||||
## 架构设计 |
||||
|
||||
### 整体架构 |
||||
|
||||
``` |
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ |
||||
│ Coze Studio │ │ OceanBase │ │ Vector Store │ |
||||
│ Application │───▶│ Client │───▶│ Manager │ |
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘ |
||||
│ |
||||
▼ |
||||
┌─────────────────┐ |
||||
│ OceanBase │ |
||||
│ Database │ |
||||
└─────────────────┘ |
||||
``` |
||||
|
||||
### 核心组件 |
||||
|
||||
#### 1. OceanBase Client (`backend/infra/impl/oceanbase/`) |
||||
|
||||
**主要文件**: |
||||
|
||||
- `oceanbase.go` - 委托客户端,提供向后兼容接口 |
||||
- `oceanbase_official.go` - 核心实现,基于官方文档 |
||||
- `types.go` - 类型定义 |
||||
|
||||
**核心功能**: |
||||
|
||||
```go |
||||
type OceanBaseClient interface { |
||||
CreateCollection(ctx context.Context, collectionName string) error |
||||
InsertVectors(ctx context.Context, collectionName string, vectors []VectorResult) error |
||||
SearchVectors(ctx context.Context, collectionName string, queryVector []float64, topK int) ([]VectorResult, error) |
||||
DeleteVector(ctx context.Context, collectionName string, vectorID string) error |
||||
InitDatabase(ctx context.Context) error |
||||
DropCollection(ctx context.Context, collectionName string) error |
||||
} |
||||
``` |
||||
|
||||
#### 2. Search Store Manager (`backend/infra/impl/document/searchstore/oceanbase/`) |
||||
|
||||
**主要文件**: |
||||
|
||||
- `oceanbase_manager.go` - 管理器实现 |
||||
- `oceanbase_searchstore.go` - 搜索存储实现 |
||||
- `factory.go` - 工厂模式创建 |
||||
- `consts.go` - 常量定义 |
||||
- `convert.go` - 数据转换 |
||||
- `register.go` - 注册函数 |
||||
|
||||
**核心功能**: |
||||
|
||||
```go |
||||
type Manager interface { |
||||
Create(ctx context.Context, collectionName string) (SearchStore, error) |
||||
Get(ctx context.Context, collectionName string) (SearchStore, error) |
||||
Delete(ctx context.Context, collectionName string) error |
||||
} |
||||
``` |
||||
|
||||
#### 3. 应用层集成 (`backend/application/base/appinfra/`) |
||||
|
||||
**文件**: `app_infra.go` |
||||
|
||||
**集成点**: |
||||
|
||||
```go |
||||
case "oceanbase": |
||||
// 构建 DSN |
||||
dsn := fmt.Sprintf("%s:%s@tcp(%s:%s)/%s?charset=utf8mb4&parseTime=True&loc=Local", |
||||
user, password, host, port, database) |
||||
|
||||
// 创建客户端 |
||||
client, err := oceanbaseClient.NewOceanBaseClient(dsn) |
||||
|
||||
// 初始化数据库 |
||||
if err := client.InitDatabase(ctx); err != nil { |
||||
return nil, fmt.Errorf("init oceanbase database failed, err=%w", err) |
||||
} |
||||
``` |
||||
|
||||
|
||||
|
||||
## 配置说明 |
||||
|
||||
### 环境变量配置 |
||||
|
||||
#### 必需配置 |
||||
|
||||
```bash |
||||
# 向量存储类型 |
||||
VECTOR_STORE_TYPE=oceanbase |
||||
|
||||
# OceanBase 连接配置 |
||||
OCEANBASE_HOST=localhost |
||||
OCEANBASE_PORT=2881 |
||||
OCEANBASE_USER=root |
||||
OCEANBASE_PASSWORD=coze123 |
||||
OCEANBASE_DATABASE=test |
||||
``` |
||||
|
||||
#### 可选配置 |
||||
|
||||
```bash |
||||
# 性能优化配置 |
||||
OCEANBASE_VECTOR_MEMORY_LIMIT_PERCENTAGE=30 |
||||
OCEANBASE_BATCH_SIZE=100 |
||||
OCEANBASE_MAX_OPEN_CONNS=100 |
||||
OCEANBASE_MAX_IDLE_CONNS=10 |
||||
|
||||
# 缓存配置 |
||||
OCEANBASE_ENABLE_CACHE=true |
||||
OCEANBASE_CACHE_TTL=300 |
||||
|
||||
# 监控配置 |
||||
OCEANBASE_ENABLE_METRICS=true |
||||
OCEANBASE_ENABLE_SLOW_QUERY_LOG=true |
||||
|
||||
# 重试配置 |
||||
OCEANBASE_MAX_RETRIES=3 |
||||
OCEANBASE_RETRY_DELAY=1 |
||||
OCEANBASE_CONN_TIMEOUT=30 |
||||
``` |
||||
|
||||
### Docker 配置 |
||||
|
||||
#### docker-compose-oceanbase.yml |
||||
|
||||
```yaml |
||||
oceanbase: |
||||
image: oceanbase/oceanbase-ce:latest |
||||
container_name: coze-oceanbase |
||||
environment: |
||||
MODE: SLIM |
||||
OB_DATAFILE_SIZE: 1G |
||||
OB_SYS_PASSWORD: ${OCEANBASE_PASSWORD:-coze123} |
||||
OB_TENANT_PASSWORD: ${OCEANBASE_PASSWORD:-coze123} |
||||
ports: |
||||
- '2881:2881' |
||||
volumes: |
||||
- ./data/oceanbase/ob:/root/ob |
||||
- ./data/oceanbase/cluster:/root/.obd/cluster |
||||
deploy: |
||||
resources: |
||||
limits: |
||||
memory: 4G |
||||
reservations: |
||||
memory: 2G |
||||
``` |
||||
|
||||
## 使用指南 |
||||
|
||||
### 1. 快速启动 |
||||
|
||||
```bash |
||||
# 克隆项目 |
||||
git clone https://github.com/coze-dev/coze-studio.git |
||||
cd coze-studio |
||||
|
||||
# 设置 OceanBase 环境文件 |
||||
make oceanbase_env |
||||
|
||||
# 启动 OceanBase 调试环境 |
||||
make oceanbase_debug |
||||
``` |
||||
|
||||
### 2. 验证部署 |
||||
|
||||
```bash |
||||
# 检查容器状态 |
||||
docker ps | grep oceanbase |
||||
|
||||
# 测试连接 |
||||
mysql -h localhost -P 2881 -u root -p -e "SELECT 1;" |
||||
|
||||
# 查看数据库 |
||||
mysql -h localhost -P 2881 -u root -p -e "SHOW DATABASES;" |
||||
``` |
||||
|
||||
### 3. 创建知识库 |
||||
|
||||
在 Coze Studio 界面中: |
||||
|
||||
1. 进入知识库管理 |
||||
2. 选择 OceanBase 作为向量存储 |
||||
3. 上传文档进行向量化 |
||||
4. 测试向量检索功能 |
||||
|
||||
### 4. 性能监控 |
||||
|
||||
```bash |
||||
# 查看容器资源使用 |
||||
docker stats coze-oceanbase |
||||
|
||||
# 查看慢查询日志 |
||||
docker logs coze-oceanbase | grep "slow query" |
||||
|
||||
# 查看连接数 |
||||
mysql -h localhost -P 2881 -u root -p -e "SHOW PROCESSLIST;" |
||||
``` |
||||
|
||||
## 适配特点 |
||||
|
||||
### 1. 设计原则 |
||||
|
||||
#### 架构兼容性设计 |
||||
|
||||
- 严格遵循 Coze Studio 核心架构设计原则,确保 OceanBase 适配层与现有系统无缝集成 |
||||
- 采用委托模式(Delegation Pattern)实现向后兼容,保证现有接口的稳定性和一致性 |
||||
- 保持与现有向量存储接口的完全兼容,确保系统平滑迁移和升级 |
||||
|
||||
#### 性能优先 |
||||
|
||||
- 使用 HNSW 索引实现高效的近似最近邻搜索 |
||||
- 批量操作减少数据库交互次数 |
||||
- 连接池管理优化资源使用 |
||||
|
||||
#### 易于部署 |
||||
|
||||
- 单机部署,无需复杂的集群配置 |
||||
- Docker 一键部署 |
||||
- 环境变量配置,灵活易用 |
||||
|
||||
### 2. 技术亮点 |
||||
|
||||
#### 委托模式设计 |
||||
|
||||
```go |
||||
type OceanBaseClient struct { |
||||
official *OceanBaseOfficialClient |
||||
} |
||||
|
||||
func (c *OceanBaseClient) CreateCollection(ctx context.Context, collectionName string) error { |
||||
return c.official.CreateCollection(ctx, collectionName) |
||||
} |
||||
``` |
||||
|
||||
#### 智能配置管理 |
||||
|
||||
```go |
||||
func DefaultConfig() *Config { |
||||
return &Config{ |
||||
Host: getEnv("OCEANBASE_HOST", "localhost"), |
||||
Port: getEnvAsInt("OCEANBASE_PORT", 2881), |
||||
User: getEnv("OCEANBASE_USER", "root"), |
||||
Password: getEnv("OCEANBASE_PASSWORD", ""), |
||||
Database: getEnv("OCEANBASE_DATABASE", "test"), |
||||
// ... 其他配置 |
||||
} |
||||
} |
||||
``` |
||||
|
||||
#### 错误处理优化 |
||||
|
||||
```go |
||||
func (c *OceanBaseOfficialClient) setVectorParameters() error { |
||||
params := map[string]string{ |
||||
"ob_vector_memory_limit_percentage": "30", |
||||
"ob_query_timeout": "86400000000", |
||||
"max_allowed_packet": "1073741824", |
||||
} |
||||
|
||||
for param, value := range params { |
||||
if err := c.db.Exec(fmt.Sprintf("SET GLOBAL %s = %s", param, value)).Error; err != nil { |
||||
log.Printf("Warning: Failed to set %s: %v", param, err) |
||||
} |
||||
} |
||||
return nil |
||||
} |
||||
``` |
||||
|
||||
## 故障排查 |
||||
|
||||
### 1. 常见问题 |
||||
|
||||
#### 连接问题 |
||||
|
||||
```bash |
||||
# 检查容器状态 |
||||
docker ps | grep oceanbase |
||||
|
||||
# 检查端口映射 |
||||
docker port coze-oceanbase |
||||
|
||||
# 测试连接 |
||||
mysql -h localhost -P 2881 -u root -p -e "SELECT 1;" |
||||
``` |
||||
|
||||
#### 向量索引问题 |
||||
|
||||
```sql |
||||
-- 检查索引状态 |
||||
SHOW INDEX FROM test_vectors; |
||||
|
||||
-- 重建索引 |
||||
DROP INDEX idx_test_embedding ON test_vectors; |
||||
CREATE VECTOR INDEX idx_test_embedding ON test_vectors(embedding) |
||||
WITH (distance=cosine, type=hnsw, lib=vsag, m=16, ef_construction=200, ef_search=64); |
||||
``` |
||||
|
||||
#### 性能问题 |
||||
|
||||
```sql |
||||
-- 调整内存限制 |
||||
SET GLOBAL ob_vector_memory_limit_percentage = 50; |
||||
|
||||
-- 查看慢查询 |
||||
SHOW VARIABLES LIKE 'slow_query_log'; |
||||
``` |
||||
|
||||
### 2. 日志分析 |
||||
|
||||
```bash |
||||
# 查看 OceanBase 日志 |
||||
docker logs coze-oceanbase |
||||
|
||||
# 查看应用日志 |
||||
tail -f logs/coze-studio.log | grep -i "oceanbase\|vector" |
||||
``` |
||||
|
||||
## 总结 |
||||
|
||||
OceanBase 向量数据库在 Coze Studio 中的集成实现了以下目标: |
||||
|
||||
1. **功能完整**: 支持完整的向量存储和检索功能 |
||||
2. **性能良好**: 通过 HNSW 索引实现高效的向量搜索 |
||||
3. **部署简单**: 单机部署,无需复杂配置 |
||||
4. **运维友好**: 低运维成本,易于监控和管理 |
||||
5. **扩展性强**: 支持水平扩展和垂直扩展 |
||||
|
||||
通过这次集成,Coze Studio 为用户提供了一个简单、高效、可靠的向量数据库解决方案,特别适合需要事务支持、部署简单、运维成本低的场景。 |
||||
|
||||
## 相关链接 |
||||
|
||||
- [OceanBase 官方文档](https://www.oceanbase.com/docs) |
||||
- [Coze Studio 项目地址](https://github.com/coze-dev/coze-studio) |
@ -0,0 +1,88 @@ |
||||
#!/bin/bash |
||||
# |
||||
# Copyright 2025 coze-dev Authors |
||||
# |
||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||
# you may not use this file except in compliance with the License. |
||||
# You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
# |
||||
|
||||
|
||||
# OceanBase Environment Configuration Script |
||||
# Dynamically modify vector store type in environment files |
||||
|
||||
set -e |
||||
|
||||
# Colors |
||||
RED='\033[0;31m' |
||||
GREEN='\033[0;32m' |
||||
YELLOW='\033[1;33m' |
||||
NC='\033[0m' |
||||
|
||||
# Script directory |
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" |
||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" |
||||
DOCKER_DIR="$PROJECT_ROOT/../docker" |
||||
|
||||
# Environment type |
||||
ENV_TYPE="${1:-debug}" |
||||
|
||||
# Validate environment type |
||||
if [[ "$ENV_TYPE" != "debug" && "$ENV_TYPE" != "env" ]]; then |
||||
echo -e "${RED}Error: Invalid environment type '$ENV_TYPE'${NC}" |
||||
echo "Usage: $0 [debug|env]" |
||||
echo " debug - Test environment (.env.debug)" |
||||
echo " env - Production environment (.env)" |
||||
exit 1 |
||||
fi |
||||
|
||||
# Determine target environment file |
||||
if [[ "$ENV_TYPE" == "debug" ]]; then |
||||
TARGET_ENV_FILE="$DOCKER_DIR/.env.debug" |
||||
else |
||||
TARGET_ENV_FILE="$DOCKER_DIR/.env" |
||||
fi |
||||
|
||||
# Check if target environment file exists |
||||
if [[ ! -f "$TARGET_ENV_FILE" ]]; then |
||||
if [[ "$ENV_TYPE" == "debug" ]]; then |
||||
cp "$DOCKER_DIR/.env.debug.example" "$TARGET_ENV_FILE" |
||||
else |
||||
cp "$DOCKER_DIR/.env.example" "$TARGET_ENV_FILE" |
||||
fi |
||||
fi |
||||
|
||||
# Check if already configured for OceanBase |
||||
if grep -q "VECTOR_STORE_TYPE.*oceanbase" "$TARGET_ENV_FILE"; then |
||||
echo -e "${YELLOW}Already configured for OceanBase${NC}" |
||||
else |
||||
echo -e "${GREEN}Configuring OceanBase...${NC}" |
||||
|
||||
# Use sed to replace VECTOR_STORE_TYPE |
||||
if [[ "$OSTYPE" == "darwin"* ]]; then |
||||
# macOS requires special handling - use temporary file to avoid .bak creation |
||||
sed "s/export VECTOR_STORE_TYPE=\"milvus\"/export VECTOR_STORE_TYPE=\"oceanbase\"/g" "$TARGET_ENV_FILE" > "$TARGET_ENV_FILE.tmp" |
||||
sed "s/export VECTOR_STORE_TYPE=\"vikingdb\"/export VECTOR_STORE_TYPE=\"oceanbase\"/g" "$TARGET_ENV_FILE.tmp" > "$TARGET_ENV_FILE" |
||||
rm -f "$TARGET_ENV_FILE.tmp" |
||||
else |
||||
# Linux systems |
||||
sed -i "s/export VECTOR_STORE_TYPE=\"milvus\"/export VECTOR_STORE_TYPE=\"oceanbase\"/g" "$TARGET_ENV_FILE" |
||||
sed -i "s/export VECTOR_STORE_TYPE=\"vikingdb\"/export VECTOR_STORE_TYPE=\"oceanbase\"/g" "$TARGET_ENV_FILE" |
||||
fi |
||||
fi |
||||
|
||||
# Verify configuration |
||||
if grep -q "VECTOR_STORE_TYPE.*oceanbase" "$TARGET_ENV_FILE"; then |
||||
echo -e "${GREEN}✅ OceanBase configured successfully${NC}" |
||||
else |
||||
echo -e "${RED}❌ Failed to configure OceanBase${NC}" |
||||
exit 1 |
||||
fi |
Loading…
Reference in new issue