full extraction

2025-12-29 16:16:00 +00:00 · 2025-06-27 17:57:12 +02:00 · 2025-06-27 17:57:12 +02:00 · 30daf29374
commit 30daf29374
parent 481df972cc
3 changed files with 102 additions and 48 deletions
--- a/README.md
+++ b/README.md
@@ -48,43 +48,48 @@ OpenDia is an open alternative to Dia. Connect to your browser with MCP & do any
   }
   ```
-## Enhanced MCP Tools (11 Total)
+## Enhanced MCP Tools (17 Total)
-### 🎯 Core Automation Tools (7 Tools)
+### 🌐 Web Browser Automation Tools (8 Tools)
- **page_analyze**: Intelligent page analysis using pattern database + semantic analysis
+- **page_analyze**: Two-phase intelligent page analysis with anti-detection bypass
-  - Finds relevant elements based on user intent (e.g., "post_tweet", "search", "login")
+  - Phase 1 (discover): Quick scan with element state detection
-  - Returns confidence-scored elements with stable IDs
+  - Phase 2 (detailed): Full analysis with element fingerprinting
  - Enhanced pattern database with confidence scoring
  - Supports Twitter/X, GitHub, and universal patterns
- **page_extract_content**: Structured content extraction
+- **page_extract_content**: Smart content extraction with summarization
  - Extract articles, search results, or social media posts
-  - Smart content detection using semantic analysis
+  - Token-efficient summaries with quality metrics
-  - Returns structured data with metadata
+  - Site-specific extraction patterns for Twitter/X, GitHub, Google
- **element_click**: Reliable element clicking
+- **element_click**: Reliable element clicking with smart targeting
  - Uses element IDs from page analysis
  - Supports different click types (left, right, double)
  - Auto-scrolls elements into view
- **element_fill**: Smart form filling with anti-detection bypass
+- **element_fill**: Enhanced form filling with anti-detection bypass
-  - Fill input fields and textareas with specialized bypasses for Twitter/X, LinkedIn, Facebook
+  - Specialized bypasses for Twitter/X, LinkedIn, Facebook
-  - Supports contenteditable elements
+  - Natural focus sequence: click → focus → fill
-  - Uses platform-specific techniques to avoid bot detection
+  - Comprehensive event simulation for modern web apps
- **page_navigate**: Enhanced navigation
+- **element_get_state**: Get detailed element state information
-  - Navigate to URLs with optional wait conditions
+  - Check if elements are disabled, clickable, visible
-  - Wait for specific elements to appear after navigation
+  - Get current values and element properties
  - Essential for conditional automation logic
 - **page_navigate**: Enhanced navigation with wait conditions
  - Navigate to URLs with optional element wait conditions
  - Timeout handling and error reporting
- **page_wait_for**: Conditional waiting
+- **page_wait_for**: Conditional waiting for elements or text
  - Wait for elements to become visible
  - Wait for specific text to appear on page
  - Configurable timeout periods
- **browser_navigate**: URL navigation
+- **page_scroll**: Scroll pages in various directions
-  - Navigate to URLs in the active tab
+  - Critical for long pages and infinite scroll content
-  - Simple navigation tool for compatibility
+  - Supports smooth scrolling and element targeting
 ### 📑 Tab Management Tools (4 Tools)
@@ -109,12 +114,30 @@ OpenDia is an open alternative to Dia. Connect to your browser with MCP & do any
  - Focus windows automatically
  - Essential for multi-tab automation workflows
-### 🔧 State Management Tools (1 Tool)
+### 📊 Browser Data Access Tools (5 Tools)
- **element_get_state**: Get detailed state information for elements
+- **get_bookmarks**: Get all bookmarks or search for specific ones
-  - Check if elements are disabled, clickable, visible
+  - Search bookmarks by query string
-  - Get current values and element properties
+  - Returns structured bookmark data with hierarchy
-  - Essential for conditional automation logic
+
 - **add_bookmark**: Add new bookmarks
  - Create bookmarks with title and URL
  - Optional parent folder support for organization
 - **get_history**: Search browser history with comprehensive filters
  - Advanced filtering by date, domains, visit count, keywords
  - Sophisticated sorting and metadata extraction
  - Perfect for finding previous work and research
 - **get_selected_text**: Get currently selected text on the page
  - Rich metadata about selection context and position
  - Includes parent element information and page context
  - Configurable length limits and truncation
 - **get_page_links**: Get all hyperlinks on current page with filtering
  - Smart filtering for internal/external links
  - Domain-specific filtering options
  - Essential for link analysis and navigation planning
 ## 🚀 Key Features
--- a/opendia-extension/content.js
+++ b/opendia-extension/content.js
@@ -1188,11 +1188,12 @@ class BrowserAutomation {
  }
  async extractContent({ content_type, max_items = 20, summarize = true }) {
    console.log(`🔍 extractContent called with: content_type=${content_type}, max_items=${max_items}, summarize=${summarize}`);
    const startTime = performance.now();
    const extractors = {
-      article: () => this.extractArticleContent(),
+      article: () => this.extractArticleContent(summarize),
-      search_results: () => this.extractSearchResults(max_items),
+      search_results: () => this.extractSearchResults(max_items, summarize),
-      posts: () => this.extractPosts(max_items),
+      posts: () => this.extractPosts(max_items, summarize),
    };
    const extractor = extractors[content_type];
@@ -1218,6 +1219,7 @@ class BrowserAutomation {
      };
    } else {
      // Legacy full content extraction
      console.log(`🎯 Returning FULL content: ${rawContent?.content?.length || 0} characters`);
      return {
        content: rawContent,
        method: "semantic_extraction",
@@ -1228,7 +1230,8 @@ class BrowserAutomation {
    }
  }
-  extractArticleContent() {
+  extractArticleContent(summarize = true) {
    console.log(`📄 extractArticleContent called with summarize=${summarize}`);
    const article = document.querySelector(
      'article, [role="article"], .article-content, main'
    );
@@ -1237,6 +1240,9 @@ class BrowserAutomation {
      ?.textContent?.trim();
    const content = article?.textContent?.trim() || this.extractMainContent();
    console.log(`📏 Extracted content length: ${content?.length || 0} characters`);
    console.log(`📝 Content preview: ${content?.substring(0, 200)}...`);
    return {
      title,
      content,
@@ -1265,7 +1271,7 @@ class BrowserAutomation {
    );
  }
-  extractSearchResults(max_items = 20) {
+  extractSearchResults(max_items = 20, summarize = true) {
    // Common search result patterns
    const selectors = [
      '.search-result, .result-item, [data-testid*="result"]',
@@ -1283,8 +1289,8 @@ class BrowserAutomation {
          .slice(0, max_items)
          .map((el, index) => ({
            index: index + 1,
-            title: this.extractResultTitle(el),
+            title: this.extractResultTitle(el, summarize),
-            summary: this.extractResultSummary(el),
+            summary: this.extractResultSummary(el, summarize),
            link: this.extractResultLink(el),
            type: this.detectResultType(el),
            score: this.scoreSearchResult(el),
@@ -1296,7 +1302,7 @@ class BrowserAutomation {
    return results;
  }
-  extractPosts(max_items = 20) {
+  extractPosts(max_items = 20, summarize = true) {
    // Social media post patterns
    const selectors = [
      '[data-testid="tweet"], .tweet, .post',
@@ -1313,7 +1319,7 @@ class BrowserAutomation {
          .slice(0, max_items)
          .map((el, index) => ({
            index: index + 1,
-            text: this.extractPostText(el),
+            text: this.extractPostText(el, summarize),
            author: this.extractPostAuthor(el),
            timestamp: this.extractPostTimestamp(el),
            metrics: this.extractPostMetrics(el),
@@ -1405,24 +1411,30 @@ class BrowserAutomation {
  }
  // Helper methods for extraction
-  extractResultTitle(element) {
+  extractResultTitle(element, summarize = true) {
    const titleSelectors = [
      'h1, h2, h3, .title, .headline, [data-testid*="title"]',
    ];
    for (const selector of titleSelectors) {
      const title = element.querySelector(selector)?.textContent?.trim();
-      if (title) return title.substring(0, 100);
+      if (title) {
        return summarize ? title.substring(0, 100) : title;
      }
    }
-    return element.textContent?.trim()?.substring(0, 50) || "No title";
+    const fallbackTitle = element.textContent?.trim() || "No title";
    return summarize ? fallbackTitle.substring(0, 50) : fallbackTitle;
  }
-  extractResultSummary(element) {
+  extractResultSummary(element, summarize = true) {
    const summarySelectors = [".summary, .description, .snippet, .excerpt"];
    for (const selector of summarySelectors) {
      const summary = element.querySelector(selector)?.textContent?.trim();
-      if (summary) return summary.substring(0, 200);
+      if (summary) {
        return summarize ? summary.substring(0, 200) : summary;
      }
    }
-    return element.textContent?.trim()?.substring(0, 150) || "";
+    const fallbackSummary = element.textContent?.trim() || "";
    return summarize ? fallbackSummary.substring(0, 150) : fallbackSummary;
  }
  extractResultLink(element) {
@@ -1453,7 +1465,7 @@ class BrowserAutomation {
    return Math.min(score, 1.0);
  }
-  extractPostText(element) {
+  extractPostText(element, summarize = true) {
    const textSelectors = [
      '[data-testid="tweetText"], .tweet-text',
      ".post-content, .entry-content",
@@ -1462,10 +1474,13 @@ class BrowserAutomation {
    for (const selector of textSelectors) {
      const text = element.querySelector(selector)?.textContent?.trim();
-      if (text) return text.substring(0, 280);
+      if (text) {
        return summarize ? text.substring(0, 280) : text;
      }
    }
-    return element.textContent?.trim()?.substring(0, 280) || "";
+    const fallbackText = element.textContent?.trim() || "";
    return summarize ? fallbackText.substring(0, 280) : fallbackText;
  }
  extractPostAuthor(element) {
--- a/opendia-mcp/server.js
+++ b/opendia-mcp/server.js
@@ -260,11 +260,27 @@ function formatPageAnalyzeResult(result, metadata) {
 function formatContentExtractionResult(result, metadata) {
  const contentSummary = `Extracted ${result.content_type} content using ${result.method}:\n\n`;
  if (result.content) {
-    const preview =
+    // Check if this is full content extraction (summarize=false) or summary
-      typeof result.content === "string"
+    // If it's a content object with properties, show full content
-        ? result.content.substring(0, 500) +
+    // If it's a string or small content, it's probably summarized
-          (result.content.length > 500 ? "..." : "")
+    let preview;
-        : JSON.stringify(result.content, null, 2).substring(0, 500);
+    if (typeof result.content === "string") {
      // String content - likely summarized, keep truncation
      preview = result.content.substring(0, 500) + (result.content.length > 500 ? "..." : "");
    } else if (result.content && typeof result.content === "object") {
      // Object content - check if it's full content extraction
      if (result.content.content && result.content.content.length > 1000) {
        // This looks like full content extraction - don't truncate
        preview = JSON.stringify(result.content, null, 2);
      } else {
        // Smaller content, apply truncation
        preview = JSON.stringify(result.content, null, 2).substring(0, 500);
      }
    } else {
      // Fallback
      preview = JSON.stringify(result.content, null, 2).substring(0, 500);
    }
    return `${contentSummary}${preview}\n\n${JSON.stringify(
      metadata,
      null,