full extraction

2025-12-29 16:16:00 +00:00 · 2025-06-27 17:57:12 +02:00
parent 481df972cc
commit 30daf29374
3 changed files with 102 additions and 48 deletions
--- a/README.md
+++ b/README.md
@@ -48,43 +48,48 @@ OpenDia is an open alternative to Dia. Connect to your browser with MCP & do any
   }
   ```

-## Enhanced MCP Tools (11 Total)
+## Enhanced MCP Tools (17 Total)

-### 🎯 Core Automation Tools (7 Tools)
+### 🌐 Web Browser Automation Tools (8 Tools)

- **page_analyze**: Intelligent page analysis using pattern database + semantic analysis
-  - Finds relevant elements based on user intent (e.g., "post_tweet", "search", "login")
-  - Returns confidence-scored elements with stable IDs
+- **page_analyze**: Two-phase intelligent page analysis with anti-detection bypass
+  - Phase 1 (discover): Quick scan with element state detection
+  - Phase 2 (detailed): Full analysis with element fingerprinting
+  - Enhanced pattern database with confidence scoring
  - Supports Twitter/X, GitHub, and universal patterns

- **page_extract_content**: Structured content extraction
+- **page_extract_content**: Smart content extraction with summarization
  - Extract articles, search results, or social media posts
-  - Smart content detection using semantic analysis
-  - Returns structured data with metadata
+  - Token-efficient summaries with quality metrics
+  - Site-specific extraction patterns for Twitter/X, GitHub, Google

- **element_click**: Reliable element clicking
+- **element_click**: Reliable element clicking with smart targeting
  - Uses element IDs from page analysis
  - Supports different click types (left, right, double)
  - Auto-scrolls elements into view

- **element_fill**: Smart form filling with anti-detection bypass
-  - Fill input fields and textareas with specialized bypasses for Twitter/X, LinkedIn, Facebook
-  - Supports contenteditable elements
-  - Uses platform-specific techniques to avoid bot detection
+- **element_fill**: Enhanced form filling with anti-detection bypass
+  - Specialized bypasses for Twitter/X, LinkedIn, Facebook
+  - Natural focus sequence: click → focus → fill
+  - Comprehensive event simulation for modern web apps

- **page_navigate**: Enhanced navigation
-  - Navigate to URLs with optional wait conditions
-  - Wait for specific elements to appear after navigation
+- **element_get_state**: Get detailed element state information
+  - Check if elements are disabled, clickable, visible
+  - Get current values and element properties
+  - Essential for conditional automation logic
+
+- **page_navigate**: Enhanced navigation with wait conditions
+  - Navigate to URLs with optional element wait conditions
  - Timeout handling and error reporting

- **page_wait_for**: Conditional waiting
+- **page_wait_for**: Conditional waiting for elements or text
  - Wait for elements to become visible
  - Wait for specific text to appear on page
  - Configurable timeout periods

- **browser_navigate**: URL navigation
-  - Navigate to URLs in the active tab
-  - Simple navigation tool for compatibility
+- **page_scroll**: Scroll pages in various directions
+  - Critical for long pages and infinite scroll content
+  - Supports smooth scrolling and element targeting

 ### 📑 Tab Management Tools (4 Tools)

@@ -109,12 +114,30 @@ OpenDia is an open alternative to Dia. Connect to your browser with MCP & do any
  - Focus windows automatically
  - Essential for multi-tab automation workflows

-### 🔧 State Management Tools (1 Tool)
+### 📊 Browser Data Access Tools (5 Tools)

- **element_get_state**: Get detailed state information for elements
-  - Check if elements are disabled, clickable, visible
-  - Get current values and element properties
-  - Essential for conditional automation logic
+- **get_bookmarks**: Get all bookmarks or search for specific ones
+  - Search bookmarks by query string
+  - Returns structured bookmark data with hierarchy
+
+- **add_bookmark**: Add new bookmarks
+  - Create bookmarks with title and URL
+  - Optional parent folder support for organization
+
+- **get_history**: Search browser history with comprehensive filters
+  - Advanced filtering by date, domains, visit count, keywords
+  - Sophisticated sorting and metadata extraction
+  - Perfect for finding previous work and research
+
+- **get_selected_text**: Get currently selected text on the page
+  - Rich metadata about selection context and position
+  - Includes parent element information and page context
+  - Configurable length limits and truncation
+
+- **get_page_links**: Get all hyperlinks on current page with filtering
+  - Smart filtering for internal/external links
+  - Domain-specific filtering options
+  - Essential for link analysis and navigation planning

 ## 🚀 Key Features

--- a/opendia-extension/content.js
+++ b/opendia-extension/content.js
@@ -1188,11 +1188,12 @@ class BrowserAutomation {
  }

  async extractContent({ content_type, max_items = 20, summarize = true }) {
+    console.log(`🔍 extractContent called with: content_type=${content_type}, max_items=${max_items}, summarize=${summarize}`);
    const startTime = performance.now();
    const extractors = {
-      article: () => this.extractArticleContent(),
-      search_results: () => this.extractSearchResults(max_items),
-      posts: () => this.extractPosts(max_items),
+      article: () => this.extractArticleContent(summarize),
+      search_results: () => this.extractSearchResults(max_items, summarize),
+      posts: () => this.extractPosts(max_items, summarize),
    };

    const extractor = extractors[content_type];
@@ -1218,6 +1219,7 @@ class BrowserAutomation {
      };
    } else {
      // Legacy full content extraction
+      console.log(`🎯 Returning FULL content: ${rawContent?.content?.length || 0} characters`);
      return {
        content: rawContent,
        method: "semantic_extraction",
@@ -1228,7 +1230,8 @@ class BrowserAutomation {
    }
  }

-  extractArticleContent() {
+  extractArticleContent(summarize = true) {
+    console.log(`📄 extractArticleContent called with summarize=${summarize}`);
    const article = document.querySelector(
      'article, [role="article"], .article-content, main'
    );
@@ -1237,6 +1240,9 @@ class BrowserAutomation {
      ?.textContent?.trim();
    const content = article?.textContent?.trim() || this.extractMainContent();

+    console.log(`📏 Extracted content length: ${content?.length || 0} characters`);
+    console.log(`📝 Content preview: ${content?.substring(0, 200)}...`);
+
    return {
      title,
      content,
@@ -1265,7 +1271,7 @@ class BrowserAutomation {
    );
  }

-  extractSearchResults(max_items = 20) {
+  extractSearchResults(max_items = 20, summarize = true) {
    // Common search result patterns
    const selectors = [
      '.search-result, .result-item, [data-testid*="result"]',
@@ -1283,8 +1289,8 @@ class BrowserAutomation {
          .slice(0, max_items)
          .map((el, index) => ({
            index: index + 1,
-            title: this.extractResultTitle(el),
-            summary: this.extractResultSummary(el),
+            title: this.extractResultTitle(el, summarize),
+            summary: this.extractResultSummary(el, summarize),
            link: this.extractResultLink(el),
            type: this.detectResultType(el),
            score: this.scoreSearchResult(el),
@@ -1296,7 +1302,7 @@ class BrowserAutomation {
    return results;
  }

-  extractPosts(max_items = 20) {
+  extractPosts(max_items = 20, summarize = true) {
    // Social media post patterns
    const selectors = [
      '[data-testid="tweet"], .tweet, .post',
@@ -1313,7 +1319,7 @@ class BrowserAutomation {
          .slice(0, max_items)
          .map((el, index) => ({
            index: index + 1,
-            text: this.extractPostText(el),
+            text: this.extractPostText(el, summarize),
            author: this.extractPostAuthor(el),
            timestamp: this.extractPostTimestamp(el),
            metrics: this.extractPostMetrics(el),
@@ -1405,24 +1411,30 @@ class BrowserAutomation {
  }

  // Helper methods for extraction
-  extractResultTitle(element) {
+  extractResultTitle(element, summarize = true) {
    const titleSelectors = [
      'h1, h2, h3, .title, .headline, [data-testid*="title"]',
    ];
    for (const selector of titleSelectors) {
      const title = element.querySelector(selector)?.textContent?.trim();
-      if (title) return title.substring(0, 100);
+      if (title) {
+        return summarize ? title.substring(0, 100) : title;
      }
-    return element.textContent?.trim()?.substring(0, 50) || "No title";
+    }
+    const fallbackTitle = element.textContent?.trim() || "No title";
+    return summarize ? fallbackTitle.substring(0, 50) : fallbackTitle;
  }

-  extractResultSummary(element) {
+  extractResultSummary(element, summarize = true) {
    const summarySelectors = [".summary, .description, .snippet, .excerpt"];
    for (const selector of summarySelectors) {
      const summary = element.querySelector(selector)?.textContent?.trim();
-      if (summary) return summary.substring(0, 200);
+      if (summary) {
+        return summarize ? summary.substring(0, 200) : summary;
      }
-    return element.textContent?.trim()?.substring(0, 150) || "";
+    }
+    const fallbackSummary = element.textContent?.trim() || "";
+    return summarize ? fallbackSummary.substring(0, 150) : fallbackSummary;
  }

  extractResultLink(element) {
@@ -1453,7 +1465,7 @@ class BrowserAutomation {
    return Math.min(score, 1.0);
  }

-  extractPostText(element) {
+  extractPostText(element, summarize = true) {
    const textSelectors = [
      '[data-testid="tweetText"], .tweet-text',
      ".post-content, .entry-content",
@@ -1462,10 +1474,13 @@ class BrowserAutomation {

    for (const selector of textSelectors) {
      const text = element.querySelector(selector)?.textContent?.trim();
-      if (text) return text.substring(0, 280);
+      if (text) {
+        return summarize ? text.substring(0, 280) : text;
+      }
    }

-    return element.textContent?.trim()?.substring(0, 280) || "";
+    const fallbackText = element.textContent?.trim() || "";
+    return summarize ? fallbackText.substring(0, 280) : fallbackText;
  }

  extractPostAuthor(element) {
--- a/opendia-mcp/server.js
+++ b/opendia-mcp/server.js
@@ -260,11 +260,27 @@ function formatPageAnalyzeResult(result, metadata) {
 function formatContentExtractionResult(result, metadata) {
  const contentSummary = `Extracted ${result.content_type} content using ${result.method}:\n\n`;
  if (result.content) {
-    const preview =
-      typeof result.content === "string"
-        ? result.content.substring(0, 500) +
-          (result.content.length > 500 ? "..." : "")
-        : JSON.stringify(result.content, null, 2).substring(0, 500);
+    // Check if this is full content extraction (summarize=false) or summary
+    // An object whose `content` field is longer than 1000 characters is treated as a full extraction and shown untruncated
+    // A string, or an object with shorter content, is treated as a summary and truncated to 500 characters
+    let preview;
+    if (typeof result.content === "string") {
+      // String content - likely summarized, keep truncation
+      preview = result.content.substring(0, 500) + (result.content.length > 500 ? "..." : "");
+    } else if (result.content && typeof result.content === "object") {
+      // Object content - check if it's full content extraction
+      if (result.content.content && result.content.content.length > 1000) {
+        // This looks like full content extraction - don't truncate
+        preview = JSON.stringify(result.content, null, 2);
+      } else {
+        // Smaller content, apply truncation
+        preview = JSON.stringify(result.content, null, 2).substring(0, 500);
+      }
+    } else {
+      // Fallback
+      preview = JSON.stringify(result.content, null, 2).substring(0, 500);
+    }
+    
    return `${contentSummary}${preview}\n\n${JSON.stringify(
      metadata,
      null,