feat: use noIndex flag to skip documents for indexing

This commit is contained in:
Daniel Thomas 2025-05-21 14:54:24 +02:00
parent 24460f6be7
commit 32f94353e6
2 changed files with 50 additions and 18 deletions

View File

@ -1,4 +1,4 @@
hooks:
hooks:
pre-commit:
- golangci-lint run --fast-only
- husky lint-staged

View File

@ -12,6 +12,8 @@ import (
"go.uber.org/zap"
)
// ContentserverDataAttributeNoIndex is the key that includeNode looks up in a repo node's Data.
// A boolean true stored under this key excludes the node from indexing.
const ContentserverDataAttributeNoIndex = "typesenseIndexing-noIndex"
type ContentServer[indexDocument any] struct {
l *zap.Logger
contentserverClient *contentserverclient.Client
@ -33,6 +35,12 @@ func NewContentServer[indexDocument any](
}
}
// Provide retrieves documents for the given indexID from the content server.
// It fetches the document IDs, retrieves the URLs for those IDs, and then uses the
// document provider functions to create the documents.
// The documents are returned as a slice of pointers to the indexDocument type.
// If a document provider function is not available for a specific document type,
// a warning is logged and that document is skipped.
func (c ContentServer[indexDocument]) Provide(
ctx context.Context,
indexID pkgx.IndexID,
@ -50,7 +58,10 @@ func (c ContentServer[indexDocument]) Provide(
documents := make([]*indexDocument, len(documentInfos))
for index, documentInfo := range documentInfos {
if documentProvider, ok := c.documentProviderFuncs[documentInfo.DocumentType]; !ok {
c.l.Warn("no document provider available for document type", zap.String("documentType", string(documentInfo.DocumentType)))
c.l.Warn(
"no document provider available for document type",
zap.String("documentType", string(documentInfo.DocumentType)),
)
} else {
document, err := documentProvider(ctx, indexID, documentInfo.DocumentID, urlsByIDs)
if err != nil {
@ -70,6 +81,7 @@ func (c ContentServer[indexDocument]) Provide(
return documents, nil
}
// ProvidePaged
func (c ContentServer[indexDocument]) ProvidePaged(
ctx context.Context,
indexID pkgx.IndexID,
@ -96,8 +108,8 @@ func (c ContentServer[indexDocument]) getDocumentIDsByIndexID(
nodeMap := createFlatRepoNodeMap(rootRepoNode, map[string]*content.RepoNode{})
documentInfos := make([]pkgx.DocumentInfo, 0, len(nodeMap))
for _, repoNode := range nodeMap {
if repoNode.Hidden || !slices.Contains(c.supportedMimeTypes, repoNode.MimeType) {
c.l.Warn("Skipping document indexing",
if !includeNode(c.supportedMimeTypes, repoNode) {
c.l.Debug("skipping document indexing",
zap.String("path", repoNode.URI),
zap.String("mimeType", repoNode.MimeType),
zap.Bool("hidden", repoNode.Hidden),
@ -114,20 +126,8 @@ func (c ContentServer[indexDocument]) getDocumentIDsByIndexID(
return documentInfos, nil
}
// createFlatRepoNodeMap recursively collects node and all nodes reachable through its Nodes
// field into nodeMap, keyed by node ID, and returns nodeMap.
// A nil node leaves nodeMap untouched. An entry that already exists under the same ID is overwritten.
// nodeMap must be non-nil whenever node is non-nil, because the function writes into it.
func createFlatRepoNodeMap(node *content.RepoNode, nodeMap map[string]*content.RepoNode) map[string]*content.RepoNode {
if node == nil {
return nodeMap
}
// Add the current node to the map under its ID.
nodeMap[node.ID] = node
// Recursively process child nodes; every call fills and returns the same map.
for _, child := range node.Nodes {
nodeMap = createFlatRepoNodeMap(child, nodeMap)
}
return nodeMap
}
// fetchURLsByDocumentIDs fetches the URLs for the given document IDs from the content server.
// It uses the contentserverClient to retrieve the URIs and maps them to DocumentID.
func (c ContentServer[indexDocument]) fetchURLsByDocumentIDs(
ctx context.Context,
indexID pkgx.IndexID,
@ -148,6 +148,8 @@ func (c ContentServer[indexDocument]) fetchURLsByDocumentIDs(
return convertMapStringToDocumentID(uriMap), nil
}
// convertMapStringToDocumentID converts a map with string keys to a map with DocumentID keys.
// The keys in the input map are converted to DocumentID type, while the values remain strings.
func convertMapStringToDocumentID(input map[string]string) map[pkgx.DocumentID]string {
output := make(map[pkgx.DocumentID]string, len(input))
for key, value := range input {
@ -155,3 +157,33 @@ func convertMapStringToDocumentID(input map[string]string) map[pkgx.DocumentID]s
}
return output
}
// includeNode reports whether node is eligible for indexing.
// It returns false for a nil node, for a node whose Data holds a boolean true under
// ContentserverDataAttributeNoIndex, and for a node whose MimeType is not listed in
// supportedMimeTypes. Every other node is included. node.Hidden is not consulted here.
func includeNode(supportedMimeTypes []string, node *content.RepoNode) bool {
	if node == nil {
		return false
	}

	// A failed type assertion yields false, so a missing or non-bool value does not exclude the node.
	if excluded, _ := node.Data[ContentserverDataAttributeNoIndex].(bool); excluded {
		return false
	}

	return slices.Contains(supportedMimeTypes, node.MimeType)
}
// createFlatRepoNodeMap walks the tree rooted at node and registers every visited node in
// nodeMap under its ID, returning that same map. A nil node leaves the map untouched.
func createFlatRepoNodeMap(node *content.RepoNode, nodeMap map[string]*content.RepoNode) map[string]*content.RepoNode {
	if node != nil {
		// Register this node first, then descend into its children.
		nodeMap[node.ID] = node
		for _, sub := range node.Nodes {
			// The map is shared across the whole recursion, so the returned value can be ignored.
			createFlatRepoNodeMap(sub, nodeMap)
		}
	}
	return nodeMap
}