Merge pull request #5 from foomo/feature/document-converter

feat: add document converter and option to remove documents from indexing
This commit is contained in:
Daniel Thomas 2025-05-21 14:56:37 +02:00 committed by GitHub
commit f18489c623
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 129 additions and 99 deletions

View File

@ -1,4 +1,4 @@
hooks:
hooks:
pre-commit:
- golangci-lint run --fast-only
- husky lint-staged

View File

@ -14,28 +14,30 @@ import (
"go.uber.org/zap"
)
const defaultSearchPresetName = "default"
// DocumentConverter turns a document decoded from the index (indexDocument)
// into the value handed back to search callers (returnType).
// ExpertSearch invokes it once for every hit it managed to decode.
type DocumentConverter[indexDocument any, returnType any] func(indexDocument) returnType
type BaseAPI[indexDocument any, returnType any] struct {
l *zap.Logger
client *typesense.Client
collections map[pkgx.IndexID]*api.CollectionSchema
preset *api.PresetUpsertSchema
revisionID pkgx.RevisionID
l *zap.Logger
client *typesense.Client
collections map[pkgx.IndexID]*api.CollectionSchema
presets map[string]*api.PresetUpsertSchema
revisionID pkgx.RevisionID
documentConverter DocumentConverter[indexDocument, returnType]
}
func NewBaseAPI[indexDocument any, returnType any](
l *zap.Logger,
client *typesense.Client,
collections map[pkgx.IndexID]*api.CollectionSchema,
preset *api.PresetUpsertSchema,
presets map[string]*api.PresetUpsertSchema,
documentConverter DocumentConverter[indexDocument, returnType],
) *BaseAPI[indexDocument, returnType] {
return &BaseAPI[indexDocument, returnType]{
l: l,
client: client,
collections: collections,
preset: preset,
l: l,
client: client,
collections: collections,
presets: presets,
documentConverter: documentConverter,
}
}
@ -149,11 +151,11 @@ func (b *BaseAPI[indexDocument, returnType]) Initialize(ctx context.Context) (pk
// Step 5: Set the latest revision ID and return
b.revisionID = newRevisionID
// Step 6: Ensure search preset is present
if b.preset != nil {
_, err := b.client.Presets().Upsert(ctx, defaultSearchPresetName, b.preset)
// Step 6: ensure search presets are present
for name, preset := range b.presets {
_, err := b.client.Presets().Upsert(ctx, name, preset)
if err != nil {
b.l.Error("failed to upsert search preset", zap.Error(err))
b.l.Error("failed to upsert preset", zap.String("name", name), zap.Error(err))
return "", err
}
}
@ -264,25 +266,18 @@ func (b *BaseAPI[indexDocument, returnType]) RevertRevision(ctx context.Context,
return nil
}
// SimpleSearch will perform a search operation on the given index
// it will return the documents and the scores
// SimpleSearch will perform a search operation on the given index using basic SearchParameters input
func (b *BaseAPI[indexDocument, returnType]) SimpleSearch(
ctx context.Context,
index pkgx.IndexID,
q string,
filterBy map[string][]string,
page, perPage int,
sortBy string,
parameters *pkgx.SearchParameters,
) ([]returnType, pkgx.Scores, int, error) {
// Call buildSearchParams but also set QueryBy explicitly
parameters := buildSearchParams(q, filterBy, page, perPage, sortBy)
parameters.QueryBy = pointer.String("title")
return b.ExpertSearch(ctx, index, parameters)
searchParams := buildSearchParams(parameters)
return b.ExpertSearch(ctx, index, searchParams)
}
// ExpertSearch will perform a search operation on the given index
// it will return the documents, scores, and totalResults
// ExpertSearch performs a search operation on the given index
// It returns the converted documents, scores, and totalResults
func (b *BaseAPI[indexDocument, returnType]) ExpertSearch(
ctx context.Context,
indexID pkgx.IndexID,
@ -299,6 +294,7 @@ func (b *BaseAPI[indexDocument, returnType]) ExpertSearch(
b.l.Error("failed to perform search", zap.String("index", collectionName), zap.Error(err))
return nil, nil, 0, err
}
// Extract totalResults from the search response
totalResults := *searchResponse.Found
@ -326,19 +322,24 @@ func (b *BaseAPI[indexDocument, returnType]) ExpertSearch(
continue
}
// Convert hit to JSON and then unmarshal into returnType
// Convert raw document (map) to indexDocument struct
hitJSON, err := json.Marshal(docMap)
if err != nil {
b.l.Warn("failed to marshal document to JSON", zap.String("index", collectionName), zap.Error(err))
continue
}
var doc returnType
if err := json.Unmarshal(hitJSON, &doc); err != nil {
b.l.Warn("failed to unmarshal JSON into returnType", zap.String("index", collectionName), zap.Error(err))
var rawDoc indexDocument
if err := json.Unmarshal(hitJSON, &rawDoc); err != nil {
b.l.Warn("failed to unmarshal JSON into indexDocument", zap.String("index", collectionName), zap.Error(err))
continue
}
results[i] = doc
// Convert the raw document using documentConverter
convertedDoc := b.documentConverter(rawDoc)
results[i] = convertedDoc
// Extract search score
index := 0
if hit.TextMatchInfo != nil && hit.TextMatchInfo.Score != nil {
if score, err := strconv.Atoi(*hit.TextMatchInfo.Score); err == nil {

View File

@ -13,50 +13,35 @@ import (
"go.uber.org/zap"
)
const defaultSearchPresetName = "default"
// buildSearchParams returns the Typesense search collection parameters.
// It is a utility for assembling those parameters from the simplified input,
// so that callers do not need any knowledge of the Typesense API.
func buildSearchParams(
q string,
filterBy map[string][]string,
page, perPage int,
sortBy string,
params *pkgx.SearchParameters,
) *api.SearchCollectionParams {
parameters := &api.SearchCollectionParams{}
parameters.Q = pointer.String(q)
if filterByString := formatFilterQuery(filterBy); filterByString != "" {
parameters.FilterBy = pointer.String(filterByString)
}
parameters.Page = pointer.Int(page)
parameters.PerPage = pointer.Int(perPage)
if sortBy != "" {
parameters.SortBy = pointer.String(sortBy)
if params.Page < 1 {
params.Page = 1
}
return parameters
}
func formatFilterQuery(filterBy map[string][]string) string {
if filterBy == nil {
return ""
searchParams := &api.SearchCollectionParams{
Page: pointer.Int(params.Page),
}
var filterClauses []string
for key, values := range filterBy {
if len(values) == 1 {
// Single value → Use `:=` operator
filterClauses = append(filterClauses, fmt.Sprintf("%s:=\"%s\"", key, values[0]))
} else {
// Multiple values → Use `["val1","val2"]` array syntax
formattedValues := []string{}
for _, v := range values {
formattedValues = append(formattedValues, fmt.Sprintf("\"%s\"", v))
}
filterClauses = append(filterClauses, fmt.Sprintf("%s:[%s]", key, strings.Join(formattedValues, ",")))
}
if params.PresetName != "" {
searchParams.Preset = pointer.String(params.PresetName)
} else {
searchParams.Preset = pointer.String(defaultSearchPresetName)
}
return strings.Join(filterClauses, " && ")
if params.Query != "" {
searchParams.Q = pointer.String(params.Query)
}
if params.Modify != nil {
params.Modify(searchParams)
}
return searchParams
}
func (b *BaseAPI[indexDocument, returnType]) generateRevisionID() pkgx.RevisionID {

View File

@ -3,6 +3,7 @@ package typesenseindexing
import (
"context"
"fmt"
"slices"
contentserverclient "github.com/foomo/contentserver/client"
@ -11,6 +12,8 @@ import (
"go.uber.org/zap"
)
const ContentserverDataAttributeNoIndex = "typesenseIndexing-noIndex"
type ContentServer[indexDocument any] struct {
l *zap.Logger
contentserverClient *contentserverclient.Client
@ -32,6 +35,12 @@ func NewContentServer[indexDocument any](
}
}
// Provide retrieves documents for the given indexID from the content server.
// It fetches the document IDs, retrieves the URLs for those IDs, and then uses the
// document provider functions to create the documents.
// The documents are returned as a slice of pointers to the indexDocument type.
// If a document provider function is not available for a specific document type,
// a warning is logged and that document is skipped.
func (c ContentServer[indexDocument]) Provide(
ctx context.Context,
indexID pkgx.IndexID,
@ -49,7 +58,10 @@ func (c ContentServer[indexDocument]) Provide(
documents := make([]*indexDocument, len(documentInfos))
for index, documentInfo := range documentInfos {
if documentProvider, ok := c.documentProviderFuncs[documentInfo.DocumentType]; !ok {
c.l.Warn("no document provider available for document type", zap.String("documentType", string(documentInfo.DocumentType)))
c.l.Warn(
"no document provider available for document type",
zap.String("documentType", string(documentInfo.DocumentType)),
)
} else {
document, err := documentProvider(ctx, indexID, documentInfo.DocumentID, urlsByIDs)
if err != nil {
@ -69,6 +81,7 @@ func (c ContentServer[indexDocument]) Provide(
return documents, nil
}
// ProvidePaged
func (c ContentServer[indexDocument]) ProvidePaged(
ctx context.Context,
indexID pkgx.IndexID,
@ -95,31 +108,26 @@ func (c ContentServer[indexDocument]) getDocumentIDsByIndexID(
nodeMap := createFlatRepoNodeMap(rootRepoNode, map[string]*content.RepoNode{})
documentInfos := make([]pkgx.DocumentInfo, 0, len(nodeMap))
for _, repoNode := range nodeMap {
if slices.Contains(c.supportedMimeTypes, repoNode.MimeType) {
documentInfos = append(documentInfos, pkgx.DocumentInfo{
DocumentType: pkgx.DocumentType(repoNode.MimeType),
DocumentID: pkgx.DocumentID(repoNode.ID),
})
if !includeNode(c.supportedMimeTypes, repoNode) {
c.l.Debug("skipping document indexing",
zap.String("path", repoNode.URI),
zap.String("mimeType", repoNode.MimeType),
zap.Bool("hidden", repoNode.Hidden),
)
continue
}
documentInfos = append(documentInfos, pkgx.DocumentInfo{
DocumentType: pkgx.DocumentType(repoNode.MimeType),
DocumentID: pkgx.DocumentID(repoNode.ID),
})
}
return documentInfos, nil
}
// createFlatRepoNodeMap walks the tree rooted at node and registers every
// visited node in nodeMap under its ID. The same map that was passed in is
// returned; a nil node leaves it untouched.
func createFlatRepoNodeMap(node *content.RepoNode, nodeMap map[string]*content.RepoNode) map[string]*content.RepoNode {
	if node != nil {
		// Register the current node before descending into its children.
		nodeMap[node.ID] = node
		for _, descendant := range node.Nodes {
			// The recursive call hands back the very same map, so its
			// result does not need to be captured.
			createFlatRepoNodeMap(descendant, nodeMap)
		}
	}
	return nodeMap
}
// fetchURLsByDocumentIDs fetches the URLs for the given document IDs from the content server.
// It uses the contentserverClient to retrieve the URIs and maps them to DocumentID.
func (c ContentServer[indexDocument]) fetchURLsByDocumentIDs(
ctx context.Context,
indexID pkgx.IndexID,
@ -140,6 +148,8 @@ func (c ContentServer[indexDocument]) fetchURLsByDocumentIDs(
return convertMapStringToDocumentID(uriMap), nil
}
// convertMapStringToDocumentID converts a map with string keys to a map with DocumentID keys.
// The keys in the input map are converted to DocumentID type, while the values remain strings.
func convertMapStringToDocumentID(input map[string]string) map[pkgx.DocumentID]string {
output := make(map[pkgx.DocumentID]string, len(input))
for key, value := range input {
@ -147,3 +157,33 @@ func convertMapStringToDocumentID(input map[string]string) map[pkgx.DocumentID]s
}
return output
}
// includeNode reports whether node is eligible for indexing.
// A node is rejected when it is nil, when its Data holds a boolean true under
// the ContentserverDataAttributeNoIndex key, or when its MimeType is not
// listed in supportedMimeTypes.
func includeNode(supportedMimeTypes []string, node *content.RepoNode) bool {
	if node == nil {
		return false
	}
	// Only a real bool true opts the node out; a missing key or a value of any
	// other type leaves the node eligible.
	excluded, isBool := node.Data[ContentserverDataAttributeNoIndex].(bool)
	if isBool && excluded {
		return false
	}
	return slices.Contains(supportedMimeTypes, node.MimeType)
}
// createFlatRepoNodeMap walks the tree rooted at node and registers every
// visited node in nodeMap under its ID. The same map that was passed in is
// returned; a nil node leaves it untouched.
func createFlatRepoNodeMap(node *content.RepoNode, nodeMap map[string]*content.RepoNode) map[string]*content.RepoNode {
	if node != nil {
		// Register the current node before descending into its children.
		nodeMap[node.ID] = node
		for _, descendant := range node.Nodes {
			// The recursive call hands back the very same map, so its
			// result does not need to be captured.
			createFlatRepoNodeMap(descendant, nodeMap)
		}
	}
	return nodeMap
}

View File

@ -17,14 +17,7 @@ type API[indexDocument any, returnType any] interface {
Initialize(ctx context.Context) (RevisionID, error)
// perform a search operation on the given index
SimpleSearch(
ctx context.Context,
index IndexID,
q string,
filterBy map[string][]string,
page, perPage int,
sortBy string,
) ([]returnType, Scores, int, error)
SimpleSearch(ctx context.Context, index IndexID, parameters *SearchParameters) ([]returnType, Scores, int, error)
ExpertSearch(ctx context.Context, index IndexID, parameters *api.SearchCollectionParams) ([]returnType, Scores, int, error)
Healthz(ctx context.Context) error
Indices() ([]IndexID, error)

View File

@ -1,6 +1,10 @@
package typesense
import "context"
import (
"context"
"github.com/typesense/typesense-go/v3/typesense/api"
)
type RevisionID string
type Query string
@ -26,3 +30,10 @@ type DocumentInfo struct {
DocumentType DocumentType
DocumentID DocumentID
}
// SearchParameters is the simplified search input accepted by SimpleSearch.
// It is translated into Typesense search collection parameters before the
// search is executed.
type SearchParameters struct {
// Query is the search text; it is only forwarded as the search query when non-empty.
Query string
// Page is the result page to request; values below 1 are raised to 1 during translation.
Page int
// PresetName selects the search preset to apply; when empty, the preset named "default" is used.
PresetName string
// Modify is an optional hook that receives the assembled Typesense parameters
// after the fields above have been applied, so callers can adjust them; nil disables it.
Modify func(params *api.SearchCollectionParams)
}