From 32f94353e6172b3aa947388c0f71826803b7f6e3 Mon Sep 17 00:00:00 2001 From: Daniel Thomas Date: Wed, 21 May 2025 14:54:24 +0200 Subject: [PATCH] feat: use noIndex flag to skip documents for indexing --- .husky.yaml | 2 +- pkg/indexing/contentserver.go | 66 ++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/.husky.yaml b/.husky.yaml index b82e55d..0284970 100644 --- a/.husky.yaml +++ b/.husky.yaml @@ -1,4 +1,4 @@ - hooks: +hooks: pre-commit: - golangci-lint run --fast-only - husky lint-staged diff --git a/pkg/indexing/contentserver.go b/pkg/indexing/contentserver.go index 3cb9280..4c4a75b 100644 --- a/pkg/indexing/contentserver.go +++ b/pkg/indexing/contentserver.go @@ -12,6 +12,8 @@ import ( "go.uber.org/zap" ) +const ContentserverDataAttributeNoIndex = "typesenseIndexing-noIndex" + type ContentServer[indexDocument any] struct { l *zap.Logger contentserverClient *contentserverclient.Client @@ -33,6 +35,12 @@ func NewContentServer[indexDocument any]( } } +// Provide retrieves documents for the given indexID from the content server. +// It fetches the document IDs, retrieves the URLs for those IDs, and then uses the +// document provider functions to create the documents. +// The documents are returned as a slice of pointers to the indexDocument type. +// If a document provider function is not available for a specific document type, +// a warning is logged and that document is skipped. 
func (c ContentServer[indexDocument]) Provide( ctx context.Context, indexID pkgx.IndexID, @@ -50,7 +58,10 @@ func (c ContentServer[indexDocument]) Provide( documents := make([]*indexDocument, len(documentInfos)) for index, documentInfo := range documentInfos { if documentProvider, ok := c.documentProviderFuncs[documentInfo.DocumentType]; !ok { - c.l.Warn("no document provider available for document type", zap.String("documentType", string(documentInfo.DocumentType))) + c.l.Warn( + "no document provider available for document type", + zap.String("documentType", string(documentInfo.DocumentType)), + ) } else { document, err := documentProvider(ctx, indexID, documentInfo.DocumentID, urlsByIDs) if err != nil { @@ -70,6 +81,7 @@ func (c ContentServer[indexDocument]) Provide( return documents, nil } +// ProvidePaged func (c ContentServer[indexDocument]) ProvidePaged( ctx context.Context, indexID pkgx.IndexID, @@ -96,8 +108,8 @@ func (c ContentServer[indexDocument]) getDocumentIDsByIndexID( nodeMap := createFlatRepoNodeMap(rootRepoNode, map[string]*content.RepoNode{}) documentInfos := make([]pkgx.DocumentInfo, 0, len(nodeMap)) for _, repoNode := range nodeMap { - if repoNode.Hidden || !slices.Contains(c.supportedMimeTypes, repoNode.MimeType) { - c.l.Warn("Skipping document indexing", + if !includeNode(c.supportedMimeTypes, repoNode) { + c.l.Debug("skipping document indexing", zap.String("path", repoNode.URI), zap.String("mimeType", repoNode.MimeType), zap.Bool("hidden", repoNode.Hidden), @@ -114,20 +126,8 @@ func (c ContentServer[indexDocument]) getDocumentIDsByIndexID( return documentInfos, nil } -// createFlatRepoNodeMap recursively retrieves all nodes from the tree and returns them in a flat map. -func createFlatRepoNodeMap(node *content.RepoNode, nodeMap map[string]*content.RepoNode) map[string]*content.RepoNode { - if node == nil { - return nodeMap - } - // Add the current node to the list. - nodeMap[node.ID] = node - // Recursively process child nodes. 
- for _, child := range node.Nodes { - nodeMap = createFlatRepoNodeMap(child, nodeMap) - } - return nodeMap -} - +// fetchURLsByDocumentIDs fetches the URLs for the given document IDs from the content server. +// It uses the contentserverClient to retrieve the URIs and maps them to DocumentID. func (c ContentServer[indexDocument]) fetchURLsByDocumentIDs( ctx context.Context, indexID pkgx.IndexID, @@ -148,6 +148,8 @@ func (c ContentServer[indexDocument]) fetchURLsByDocumentIDs( return convertMapStringToDocumentID(uriMap), nil } +// convertMapStringToDocumentID converts a map with string keys to a map with DocumentID keys. +// The keys in the input map are converted to DocumentID type, while the values remain strings. func convertMapStringToDocumentID(input map[string]string) map[pkgx.DocumentID]string { output := make(map[pkgx.DocumentID]string, len(input)) for key, value := range input { @@ -155,3 +157,33 @@ func convertMapStringToDocumentID(input map[string]string) map[pkgx.DocumentID]s } return output } + +// includeNode reports whether the node should be included in the indexing process. +// A node is included only if it is non-nil, does not have the noIndex data attribute +// set to the boolean value true, and has a mime type contained in supportedMimeTypes. +func includeNode(supportedMimeTypes []string, node *content.RepoNode) bool { + if node == nil { + return false + } + if noIndex, noIndexSet := node.Data[ContentserverDataAttributeNoIndex].(bool); noIndexSet && noIndex { + return false + } + if !slices.Contains(supportedMimeTypes, node.MimeType) { + return false + } + return true +} + +// createFlatRepoNodeMap recursively retrieves all nodes from the tree and returns them in a flat map. +func createFlatRepoNodeMap(node *content.RepoNode, nodeMap map[string]*content.RepoNode) map[string]*content.RepoNode { + if node == nil { + return nodeMap + } + // Add the current node to the map, keyed by its ID. + nodeMap[node.ID] = node + // Recursively process child nodes.
+ for _, child := range node.Nodes { + nodeMap = createFlatRepoNodeMap(child, nodeMap) + } + return nodeMap +}