feat: use noIndex flag to skip documents for indexing

2025-10-16 12:45:37 +00:00 · 2025-05-21 14:54:24 +02:00 · 2025-05-21 14:54:24 +02:00 · 32f94353e6
commit 32f94353e6
parent 24460f6be7
2 changed files with 50 additions and 18 deletions
--- a/.husky.yaml
+++ b/.husky.yaml
@ -1,4 +1,4 @@
- hooks:
+hooks:
  pre-commit:
    - golangci-lint run --fast-only
    - husky lint-staged
--- a/pkg/indexing/contentserver.go
+++ b/pkg/indexing/contentserver.go
@ -12,6 +12,8 @@ import (
 	"go.uber.org/zap"
 )

+const ContentserverDataAttributeNoIndex = "typesenseIndexing-noIndex"
+
 type ContentServer[indexDocument any] struct {
 	l                     *zap.Logger
 	contentserverClient   *contentserverclient.Client
@ -33,6 +35,12 @@ func NewContentServer[indexDocument any](
 	}
 }

+// Provide retrieves documents for the given indexID from the content server.
+// It fetches the document IDs, retrieves the URLs for those IDs, and then uses the
+// document provider functions to create the documents.
+// The documents are returned as a slice of pointers to the indexDocument type.
+// If a document provider function is not available for a specific document type,
+// a warning is logged and that document is skipped.
 func (c ContentServer[indexDocument]) Provide(
 	ctx context.Context,
 	indexID pkgx.IndexID,
@ -50,7 +58,10 @@ func (c ContentServer[indexDocument]) Provide(
 	documents := make([]*indexDocument, len(documentInfos))
 	for index, documentInfo := range documentInfos {
 		if documentProvider, ok := c.documentProviderFuncs[documentInfo.DocumentType]; !ok {
-			c.l.Warn("no document provider available for document type", zap.String("documentType", string(documentInfo.DocumentType)))
+			c.l.Warn(
+				"no document provider available for document type",
+				zap.String("documentType", string(documentInfo.DocumentType)),
+			)
 		} else {
 			document, err := documentProvider(ctx, indexID, documentInfo.DocumentID, urlsByIDs)
 			if err != nil {
@ -70,6 +81,7 @@ func (c ContentServer[indexDocument]) Provide(
 	return documents, nil
 }

+// ProvidePaged appears to be the paged counterpart of Provide — TODO(review): document the paging contract.
 func (c ContentServer[indexDocument]) ProvidePaged(
 	ctx context.Context,
 	indexID pkgx.IndexID,
@ -96,8 +108,8 @@ func (c ContentServer[indexDocument]) getDocumentIDsByIndexID(
 	nodeMap := createFlatRepoNodeMap(rootRepoNode, map[string]*content.RepoNode{})
 	documentInfos := make([]pkgx.DocumentInfo, 0, len(nodeMap))
 	for _, repoNode := range nodeMap {
-		if repoNode.Hidden || !slices.Contains(c.supportedMimeTypes, repoNode.MimeType) {
-			c.l.Warn("Skipping document indexing",
+		if !includeNode(c.supportedMimeTypes, repoNode) {
+			c.l.Debug("skipping document indexing",
 				zap.String("path", repoNode.URI),
 				zap.String("mimeType", repoNode.MimeType),
 				zap.Bool("hidden", repoNode.Hidden),
@ -114,20 +126,8 @@ func (c ContentServer[indexDocument]) getDocumentIDsByIndexID(
 	return documentInfos, nil
 }

-// createFlatRepoNodeMap recursively retrieves all nodes from the tree and returns them in a flat map.
-func createFlatRepoNodeMap(node *content.RepoNode, nodeMap map[string]*content.RepoNode) map[string]*content.RepoNode {
-	if node == nil {
-		return nodeMap
-	}
-	// Add the current node to the list.
-	nodeMap[node.ID] = node
-	// Recursively process child nodes.
-	for _, child := range node.Nodes {
-		nodeMap = createFlatRepoNodeMap(child, nodeMap)
-	}
-	return nodeMap
-}
-
+// fetchURLsByDocumentIDs fetches the URLs for the given document IDs from the content server.
+// It uses the contentserverClient to retrieve the URIs and maps them to DocumentID.
 func (c ContentServer[indexDocument]) fetchURLsByDocumentIDs(
 	ctx context.Context,
 	indexID pkgx.IndexID,
@ -148,6 +148,8 @@ func (c ContentServer[indexDocument]) fetchURLsByDocumentIDs(
 	return convertMapStringToDocumentID(uriMap), nil
 }

+// convertMapStringToDocumentID converts a map with string keys to a map with DocumentID keys.
+// The keys in the input map are converted to DocumentID type, while the values remain strings.
 func convertMapStringToDocumentID(input map[string]string) map[pkgx.DocumentID]string {
 	output := make(map[pkgx.DocumentID]string, len(input))
 	for key, value := range input {
@ -155,3 +157,33 @@ func convertMapStringToDocumentID(input map[string]string) map[pkgx.DocumentID]s
 	}
 	return output
 }
+
+// includeNode reports whether node should be indexed. It returns false for a nil node, for a
+// node whose Data[ContentserverDataAttributeNoIndex] holds the bool value true, and for a mime type
+// missing from supportedMimeTypes. node.Hidden is not consulted; non-bool attribute values are ignored.
+func includeNode(supportedMimeTypes []string, node *content.RepoNode) bool {
+	if node == nil {
+		return false
+	}
+	if noIndex, noIndexSet := node.Data[ContentserverDataAttributeNoIndex].(bool); noIndexSet && noIndex {
+		return false
+	}
+	if !slices.Contains(supportedMimeTypes, node.MimeType) {
+		return false
+	}
+	return true
+}
+
+// createFlatRepoNodeMap walks the tree rooted at node depth-first, stores every node in nodeMap keyed by its ID, and returns nodeMap.
+func createFlatRepoNodeMap(node *content.RepoNode, nodeMap map[string]*content.RepoNode) map[string]*content.RepoNode {
+	if node == nil {
+		return nodeMap
+	}
+	// Register the current node under its ID; a node visited later with the same ID overwrites it.
+	nodeMap[node.ID] = node
+	// Descend into the children; a nil child is a no-op because of the guard above.
+	for _, child := range node.Nodes {
+		nodeMap = createFlatRepoNodeMap(child, nodeMap)
+	}
+	return nodeMap
+}