full extraction

This commit is contained in:
Aaron Elijah Mars 2025-06-27 17:57:12 +02:00
parent 481df972cc
commit 30daf29374
3 changed files with 102 additions and 48 deletions

View File

@ -48,43 +48,48 @@ OpenDia is an open alternative to Dia. Connect to your browser with MCP & do any
}
```
## Enhanced MCP Tools (11 Total)
## Enhanced MCP Tools (17 Total)
### 🎯 Core Automation Tools (7 Tools)
### 🌐 Web Browser Automation Tools (8 Tools)
- **page_analyze**: Intelligent page analysis using pattern database + semantic analysis
- Finds relevant elements based on user intent (e.g., "post_tweet", "search", "login")
- Returns confidence-scored elements with stable IDs
- **page_analyze**: Two-phase intelligent page analysis with anti-detection bypass
- Phase 1 (discover): Quick scan with element state detection
- Phase 2 (detailed): Full analysis with element fingerprinting
- Enhanced pattern database with confidence scoring
- Supports Twitter/X, GitHub, and universal patterns
- **page_extract_content**: Structured content extraction
- **page_extract_content**: Smart content extraction with summarization
- Extract articles, search results, or social media posts
- Smart content detection using semantic analysis
- Returns structured data with metadata
- Token-efficient summaries with quality metrics
- Site-specific extraction patterns for Twitter/X, GitHub, Google
- **element_click**: Reliable element clicking
- **element_click**: Reliable element clicking with smart targeting
- Uses element IDs from page analysis
- Supports different click types (left, right, double)
- Auto-scrolls elements into view
- **element_fill**: Smart form filling with anti-detection bypass
- Fill input fields and textareas with specialized bypasses for Twitter/X, LinkedIn, Facebook
- Supports contenteditable elements
- Uses platform-specific techniques to avoid bot detection
- **element_fill**: Enhanced form filling with anti-detection bypass
- Specialized bypasses for Twitter/X, LinkedIn, Facebook
- Natural focus sequence: click → focus → fill
- Comprehensive event simulation for modern web apps
- **page_navigate**: Enhanced navigation
- Navigate to URLs with optional wait conditions
- Wait for specific elements to appear after navigation
- **element_get_state**: Get detailed element state information
- Check if elements are disabled, clickable, visible
- Get current values and element properties
- Essential for conditional automation logic
- **page_navigate**: Enhanced navigation with wait conditions
- Navigate to URLs with optional element wait conditions
- Timeout handling and error reporting
- **page_wait_for**: Conditional waiting
- **page_wait_for**: Conditional waiting for elements or text
- Wait for elements to become visible
- Wait for specific text to appear on page
- Configurable timeout periods
- **browser_navigate**: URL navigation
- Navigate to URLs in the active tab
- Simple navigation tool for compatibility
- **page_scroll**: Scroll pages in various directions
- Critical for long pages and infinite scroll content
- Supports smooth scrolling and element targeting
### 📑 Tab Management Tools (4 Tools)
@ -109,12 +114,30 @@ OpenDia is an open alternative to Dia. Connect to your browser with MCP & do any
- Focus windows automatically
- Essential for multi-tab automation workflows
### 🔧 State Management Tools (1 Tool)
### 📊 Browser Data Access Tools (5 Tools)
- **element_get_state**: Get detailed state information for elements
- Check if elements are disabled, clickable, visible
- Get current values and element properties
- Essential for conditional automation logic
- **get_bookmarks**: Get all bookmarks or search for specific ones
- Search bookmarks by query string
- Returns structured bookmark data with hierarchy
- **add_bookmark**: Add new bookmarks
- Create bookmarks with title and URL
- Optional parent folder support for organization
- **get_history**: Search browser history with comprehensive filters
- Advanced filtering by date, domains, visit count, keywords
- Sophisticated sorting and metadata extraction
- Perfect for finding previous work and research
- **get_selected_text**: Get currently selected text on the page
- Rich metadata about selection context and position
- Includes parent element information and page context
- Configurable length limits and truncation
- **get_page_links**: Get all hyperlinks on current page with filtering
- Smart filtering for internal/external links
- Domain-specific filtering options
- Essential for link analysis and navigation planning
## 🚀 Key Features

View File

@ -1188,11 +1188,12 @@ class BrowserAutomation {
}
async extractContent({ content_type, max_items = 20, summarize = true }) {
console.log(`🔍 extractContent called with: content_type=${content_type}, max_items=${max_items}, summarize=${summarize}`);
const startTime = performance.now();
const extractors = {
article: () => this.extractArticleContent(),
search_results: () => this.extractSearchResults(max_items),
posts: () => this.extractPosts(max_items),
article: () => this.extractArticleContent(summarize),
search_results: () => this.extractSearchResults(max_items, summarize),
posts: () => this.extractPosts(max_items, summarize),
};
const extractor = extractors[content_type];
@ -1218,6 +1219,7 @@ class BrowserAutomation {
};
} else {
// Legacy full content extraction
console.log(`🎯 Returning FULL content: ${rawContent?.content?.length || 0} characters`);
return {
content: rawContent,
method: "semantic_extraction",
@ -1228,7 +1230,8 @@ class BrowserAutomation {
}
}
extractArticleContent() {
extractArticleContent(summarize = true) {
console.log(`📄 extractArticleContent called with summarize=${summarize}`);
const article = document.querySelector(
'article, [role="article"], .article-content, main'
);
@ -1237,6 +1240,9 @@ class BrowserAutomation {
?.textContent?.trim();
const content = article?.textContent?.trim() || this.extractMainContent();
console.log(`📏 Extracted content length: ${content?.length || 0} characters`);
console.log(`📝 Content preview: ${content?.substring(0, 200)}...`);
return {
title,
content,
@ -1265,7 +1271,7 @@ class BrowserAutomation {
);
}
extractSearchResults(max_items = 20) {
extractSearchResults(max_items = 20, summarize = true) {
// Common search result patterns
const selectors = [
'.search-result, .result-item, [data-testid*="result"]',
@ -1283,8 +1289,8 @@ class BrowserAutomation {
.slice(0, max_items)
.map((el, index) => ({
index: index + 1,
title: this.extractResultTitle(el),
summary: this.extractResultSummary(el),
title: this.extractResultTitle(el, summarize),
summary: this.extractResultSummary(el, summarize),
link: this.extractResultLink(el),
type: this.detectResultType(el),
score: this.scoreSearchResult(el),
@ -1296,7 +1302,7 @@ class BrowserAutomation {
return results;
}
extractPosts(max_items = 20) {
extractPosts(max_items = 20, summarize = true) {
// Social media post patterns
const selectors = [
'[data-testid="tweet"], .tweet, .post',
@ -1313,7 +1319,7 @@ class BrowserAutomation {
.slice(0, max_items)
.map((el, index) => ({
index: index + 1,
text: this.extractPostText(el),
text: this.extractPostText(el, summarize),
author: this.extractPostAuthor(el),
timestamp: this.extractPostTimestamp(el),
metrics: this.extractPostMetrics(el),
@ -1405,24 +1411,30 @@ class BrowserAutomation {
}
// Helper methods for extraction
/**
 * Derive a title string for one search-result element.
 *
 * NOTE(review): this span appears to be a rendered diff with the +/- markers
 * stripped, so two versions of the method are interleaved line by line: the
 * one-argument form (presumably the pre-commit side) and the form taking
 * `summarize` (presumably the post-commit side). Confirm against the real file.
 *
 * Behavior of the `summarize` form, as written below:
 *  - Looks up the first descendant matching the selector group and uses its
 *    trimmed text when that text is non-empty.
 *  - Otherwise falls back to the element's own trimmed text, or the literal
 *    "No title" when that is empty.
 *  - With `summarize` truthy the matched title is cut to 100 characters and
 *    the fallback to 50; with `summarize` falsy the text is returned uncut.
 *
 * @param {Element} element - Result container to read from.
 * @param {boolean} [summarize=true] - Whether to apply the length caps.
 * @returns {string} The extracted title text.
 */
extractResultTitle(element) {
extractResultTitle(element, summarize = true) {
// Single entry holding a comma-separated selector group; the loop below
// therefore runs once.
const titleSelectors = [
'h1, h2, h3, .title, .headline, [data-testid*="title"]',
];
for (const selector of titleSelectors) {
const title = element.querySelector(selector)?.textContent?.trim();
// One-argument form: always cap the matched title at 100 characters.
if (title) return title.substring(0, 100);
// `summarize` form: cap only when summarizing, otherwise return it whole.
if (title) {
return summarize ? title.substring(0, 100) : title;
}
}
// One-argument form: fallback text capped at 50 characters.
return element.textContent?.trim()?.substring(0, 50) || "No title";
// `summarize` form: same fallback, with the 50-character cap made conditional.
const fallbackTitle = element.textContent?.trim() || "No title";
return summarize ? fallbackTitle.substring(0, 50) : fallbackTitle;
}
/**
 * Derive a short description string for one search-result element.
 *
 * NOTE(review): this span appears to be a rendered diff with the +/- markers
 * stripped, so the one-argument form (presumably pre-commit) and the
 * `summarize` form (presumably post-commit) are interleaved. Confirm against
 * the real file.
 *
 * Behavior of the `summarize` form, as written below:
 *  - Uses the trimmed text of the first descendant matching
 *    `.summary, .description, .snippet, .excerpt` when it is non-empty.
 *  - Otherwise falls back to the element's own trimmed text, or "" when empty.
 *  - With `summarize` truthy the matched summary is cut to 200 characters and
 *    the fallback to 150; with `summarize` falsy the text is returned uncut.
 *
 * @param {Element} element - Result container to read from.
 * @param {boolean} [summarize=true] - Whether to apply the length caps.
 * @returns {string} The extracted summary text, possibly empty.
 */
extractResultSummary(element) {
extractResultSummary(element, summarize = true) {
// Single entry holding a comma-separated selector group.
const summarySelectors = [".summary, .description, .snippet, .excerpt"];
for (const selector of summarySelectors) {
const summary = element.querySelector(selector)?.textContent?.trim();
// One-argument form: always cap the matched summary at 200 characters.
if (summary) return summary.substring(0, 200);
// `summarize` form: cap only when summarizing.
if (summary) {
return summarize ? summary.substring(0, 200) : summary;
}
}
// One-argument form: fallback text capped at 150 characters.
return element.textContent?.trim()?.substring(0, 150) || "";
// `summarize` form: same fallback, with the 150-character cap made conditional.
const fallbackSummary = element.textContent?.trim() || "";
return summarize ? fallbackSummary.substring(0, 150) : fallbackSummary;
}
extractResultLink(element) {
@ -1453,7 +1465,7 @@ class BrowserAutomation {
return Math.min(score, 1.0);
}
extractPostText(element) {
extractPostText(element, summarize = true) {
const textSelectors = [
'[data-testid="tweetText"], .tweet-text',
".post-content, .entry-content",
@ -1462,10 +1474,13 @@ class BrowserAutomation {
for (const selector of textSelectors) {
const text = element.querySelector(selector)?.textContent?.trim();
if (text) return text.substring(0, 280);
if (text) {
return summarize ? text.substring(0, 280) : text;
}
}
return element.textContent?.trim()?.substring(0, 280) || "";
const fallbackText = element.textContent?.trim() || "";
return summarize ? fallbackText.substring(0, 280) : fallbackText;
}
extractPostAuthor(element) {

View File

@ -260,11 +260,27 @@ function formatPageAnalyzeResult(result, metadata) {
function formatContentExtractionResult(result, metadata) {
const contentSummary = `Extracted ${result.content_type} content using ${result.method}:\n\n`;
if (result.content) {
const preview =
typeof result.content === "string"
? result.content.substring(0, 500) +
(result.content.length > 500 ? "..." : "")
: JSON.stringify(result.content, null, 2).substring(0, 500);
// Check if this is full content extraction (summarize=false) or summary
// If it's a content object with properties, show full content
// If it's a string or small content, it's probably summarized
let preview;
if (typeof result.content === "string") {
// String content - likely summarized, keep truncation
preview = result.content.substring(0, 500) + (result.content.length > 500 ? "..." : "");
} else if (result.content && typeof result.content === "object") {
// Object content - check if it's full content extraction
if (result.content.content && result.content.content.length > 1000) {
// This looks like full content extraction - don't truncate
preview = JSON.stringify(result.content, null, 2);
} else {
// Smaller content, apply truncation
preview = JSON.stringify(result.content, null, 2).substring(0, 500);
}
} else {
// Fallback
preview = JSON.stringify(result.content, null, 2).substring(0, 500);
}
return `${contentSummary}${preview}\n\n${JSON.stringify(
metadata,
null,