mirror of
https://github.com/aaronjmars/opendia.git
synced 2025-12-17 17:56:36 +00:00
full extraction
This commit is contained in:
parent
481df972cc
commit
30daf29374
73
README.md
73
README.md
@ -48,43 +48,48 @@ OpenDia is an open alternative to Dia. Connect to your browser with MCP & do any
|
||||
}
|
||||
```
|
||||
|
||||
## Enhanced MCP Tools (11 Total)
|
||||
## Enhanced MCP Tools (17 Total)
|
||||
|
||||
### 🎯 Core Automation Tools (7 Tools)
|
||||
### 🌐 Web Browser Automation Tools (8 Tools)
|
||||
|
||||
- **page_analyze**: Intelligent page analysis using pattern database + semantic analysis
|
||||
- Finds relevant elements based on user intent (e.g., "post_tweet", "search", "login")
|
||||
- Returns confidence-scored elements with stable IDs
|
||||
- **page_analyze**: Two-phase intelligent page analysis with anti-detection bypass
|
||||
- Phase 1 (discover): Quick scan with element state detection
|
||||
- Phase 2 (detailed): Full analysis with element fingerprinting
|
||||
- Enhanced pattern database with confidence scoring
|
||||
- Supports Twitter/X, GitHub, and universal patterns
|
||||
|
||||
- **page_extract_content**: Structured content extraction
|
||||
- **page_extract_content**: Smart content extraction with summarization
|
||||
- Extract articles, search results, or social media posts
|
||||
- Smart content detection using semantic analysis
|
||||
- Returns structured data with metadata
|
||||
- Token-efficient summaries with quality metrics
|
||||
- Site-specific extraction patterns for Twitter/X, GitHub, Google
|
||||
|
||||
- **element_click**: Reliable element clicking
|
||||
- **element_click**: Reliable element clicking with smart targeting
|
||||
- Uses element IDs from page analysis
|
||||
- Supports different click types (left, right, double)
|
||||
- Auto-scrolls elements into view
|
||||
|
||||
- **element_fill**: Smart form filling with anti-detection bypass
|
||||
- Fill input fields and textareas with specialized bypasses for Twitter/X, LinkedIn, Facebook
|
||||
- Supports contenteditable elements
|
||||
- Uses platform-specific techniques to avoid bot detection
|
||||
- **element_fill**: Enhanced form filling with anti-detection bypass
|
||||
- Specialized bypasses for Twitter/X, LinkedIn, Facebook
|
||||
- Natural focus sequence: click → focus → fill
|
||||
- Comprehensive event simulation for modern web apps
|
||||
|
||||
- **page_navigate**: Enhanced navigation
|
||||
- Navigate to URLs with optional wait conditions
|
||||
- Wait for specific elements to appear after navigation
|
||||
- **element_get_state**: Get detailed element state information
|
||||
- Check if elements are disabled, clickable, visible
|
||||
- Get current values and element properties
|
||||
- Essential for conditional automation logic
|
||||
|
||||
- **page_navigate**: Enhanced navigation with wait conditions
|
||||
- Navigate to URLs with optional element wait conditions
|
||||
- Timeout handling and error reporting
|
||||
|
||||
- **page_wait_for**: Conditional waiting
|
||||
- **page_wait_for**: Conditional waiting for elements or text
|
||||
- Wait for elements to become visible
|
||||
- Wait for specific text to appear on page
|
||||
- Configurable timeout periods
|
||||
|
||||
- **browser_navigate**: URL navigation
|
||||
- Navigate to URLs in the active tab
|
||||
- Simple navigation tool for compatibility
|
||||
- **page_scroll**: Scroll pages in various directions
|
||||
- Critical for long pages and infinite-scroll content
|
||||
- Supports smooth scrolling and element targeting
|
||||
|
||||
### 📑 Tab Management Tools (4 Tools)
|
||||
|
||||
@ -109,12 +114,30 @@ OpenDia is an open alternative to Dia. Connect to your browser with MCP & do any
|
||||
- Focus windows automatically
|
||||
- Essential for multi-tab automation workflows
|
||||
|
||||
### 🔧 State Management Tools (1 Tool)
|
||||
### 📊 Browser Data Access Tools (5 Tools)
|
||||
|
||||
- **element_get_state**: Get detailed state information for elements
|
||||
- Check if elements are disabled, clickable, visible
|
||||
- Get current values and element properties
|
||||
- Essential for conditional automation logic
|
||||
- **get_bookmarks**: Get all bookmarks or search for specific ones
|
||||
- Search bookmarks by query string
|
||||
- Returns structured bookmark data with hierarchy
|
||||
|
||||
- **add_bookmark**: Add new bookmarks
|
||||
- Create bookmarks with title and URL
|
||||
- Optional parent folder support for organization
|
||||
|
||||
- **get_history**: Search browser history with comprehensive filters
|
||||
- Advanced filtering by date, domains, visit count, keywords
|
||||
- Sophisticated sorting and metadata extraction
|
||||
- Perfect for finding previous work and research
|
||||
|
||||
- **get_selected_text**: Get currently selected text on the page
|
||||
- Rich metadata about selection context and position
|
||||
- Includes parent element information and page context
|
||||
- Configurable length limits and truncation
|
||||
|
||||
- **get_page_links**: Get all hyperlinks on current page with filtering
|
||||
- Smart filtering for internal/external links
|
||||
- Domain-specific filtering options
|
||||
- Essential for link analysis and navigation planning
|
||||
|
||||
## 🚀 Key Features
|
||||
|
||||
|
||||
@ -1188,11 +1188,12 @@ class BrowserAutomation {
|
||||
}
|
||||
|
||||
async extractContent({ content_type, max_items = 20, summarize = true }) {
|
||||
console.log(`🔍 extractContent called with: content_type=${content_type}, max_items=${max_items}, summarize=${summarize}`);
|
||||
const startTime = performance.now();
|
||||
const extractors = {
|
||||
article: () => this.extractArticleContent(),
|
||||
search_results: () => this.extractSearchResults(max_items),
|
||||
posts: () => this.extractPosts(max_items),
|
||||
article: () => this.extractArticleContent(summarize),
|
||||
search_results: () => this.extractSearchResults(max_items, summarize),
|
||||
posts: () => this.extractPosts(max_items, summarize),
|
||||
};
|
||||
|
||||
const extractor = extractors[content_type];
|
||||
@ -1218,6 +1219,7 @@ class BrowserAutomation {
|
||||
};
|
||||
} else {
|
||||
// Legacy full content extraction
|
||||
console.log(`🎯 Returning FULL content: ${rawContent?.content?.length || 0} characters`);
|
||||
return {
|
||||
content: rawContent,
|
||||
method: "semantic_extraction",
|
||||
@ -1228,7 +1230,8 @@ class BrowserAutomation {
|
||||
}
|
||||
}
|
||||
|
||||
extractArticleContent() {
|
||||
extractArticleContent(summarize = true) {
|
||||
console.log(`📄 extractArticleContent called with summarize=${summarize}`);
|
||||
const article = document.querySelector(
|
||||
'article, [role="article"], .article-content, main'
|
||||
);
|
||||
@ -1237,6 +1240,9 @@ class BrowserAutomation {
|
||||
?.textContent?.trim();
|
||||
const content = article?.textContent?.trim() || this.extractMainContent();
|
||||
|
||||
console.log(`📏 Extracted content length: ${content?.length || 0} characters`);
|
||||
console.log(`📝 Content preview: ${content?.substring(0, 200)}...`);
|
||||
|
||||
return {
|
||||
title,
|
||||
content,
|
||||
@ -1265,7 +1271,7 @@ class BrowserAutomation {
|
||||
);
|
||||
}
|
||||
|
||||
extractSearchResults(max_items = 20) {
|
||||
extractSearchResults(max_items = 20, summarize = true) {
|
||||
// Common search result patterns
|
||||
const selectors = [
|
||||
'.search-result, .result-item, [data-testid*="result"]',
|
||||
@ -1283,8 +1289,8 @@ class BrowserAutomation {
|
||||
.slice(0, max_items)
|
||||
.map((el, index) => ({
|
||||
index: index + 1,
|
||||
title: this.extractResultTitle(el),
|
||||
summary: this.extractResultSummary(el),
|
||||
title: this.extractResultTitle(el, summarize),
|
||||
summary: this.extractResultSummary(el, summarize),
|
||||
link: this.extractResultLink(el),
|
||||
type: this.detectResultType(el),
|
||||
score: this.scoreSearchResult(el),
|
||||
@ -1296,7 +1302,7 @@ class BrowserAutomation {
|
||||
return results;
|
||||
}
|
||||
|
||||
extractPosts(max_items = 20) {
|
||||
extractPosts(max_items = 20, summarize = true) {
|
||||
// Social media post patterns
|
||||
const selectors = [
|
||||
'[data-testid="tweet"], .tweet, .post',
|
||||
@ -1313,7 +1319,7 @@ class BrowserAutomation {
|
||||
.slice(0, max_items)
|
||||
.map((el, index) => ({
|
||||
index: index + 1,
|
||||
text: this.extractPostText(el),
|
||||
text: this.extractPostText(el, summarize),
|
||||
author: this.extractPostAuthor(el),
|
||||
timestamp: this.extractPostTimestamp(el),
|
||||
metrics: this.extractPostMetrics(el),
|
||||
@ -1405,24 +1411,30 @@ class BrowserAutomation {
|
||||
}
|
||||
|
||||
// Helper methods for extraction
|
||||
extractResultTitle(element) {
|
||||
extractResultTitle(element, summarize = true) {
|
||||
const titleSelectors = [
|
||||
'h1, h2, h3, .title, .headline, [data-testid*="title"]',
|
||||
];
|
||||
for (const selector of titleSelectors) {
|
||||
const title = element.querySelector(selector)?.textContent?.trim();
|
||||
if (title) return title.substring(0, 100);
|
||||
if (title) {
|
||||
return summarize ? title.substring(0, 100) : title;
|
||||
}
|
||||
return element.textContent?.trim()?.substring(0, 50) || "No title";
|
||||
}
|
||||
const fallbackTitle = element.textContent?.trim() || "No title";
|
||||
return summarize ? fallbackTitle.substring(0, 50) : fallbackTitle;
|
||||
}
|
||||
|
||||
extractResultSummary(element) {
|
||||
extractResultSummary(element, summarize = true) {
|
||||
const summarySelectors = [".summary, .description, .snippet, .excerpt"];
|
||||
for (const selector of summarySelectors) {
|
||||
const summary = element.querySelector(selector)?.textContent?.trim();
|
||||
if (summary) return summary.substring(0, 200);
|
||||
if (summary) {
|
||||
return summarize ? summary.substring(0, 200) : summary;
|
||||
}
|
||||
return element.textContent?.trim()?.substring(0, 150) || "";
|
||||
}
|
||||
const fallbackSummary = element.textContent?.trim() || "";
|
||||
return summarize ? fallbackSummary.substring(0, 150) : fallbackSummary;
|
||||
}
|
||||
|
||||
extractResultLink(element) {
|
||||
@ -1453,7 +1465,7 @@ class BrowserAutomation {
|
||||
return Math.min(score, 1.0);
|
||||
}
|
||||
|
||||
extractPostText(element) {
|
||||
extractPostText(element, summarize = true) {
|
||||
const textSelectors = [
|
||||
'[data-testid="tweetText"], .tweet-text',
|
||||
".post-content, .entry-content",
|
||||
@ -1462,10 +1474,13 @@ class BrowserAutomation {
|
||||
|
||||
for (const selector of textSelectors) {
|
||||
const text = element.querySelector(selector)?.textContent?.trim();
|
||||
if (text) return text.substring(0, 280);
|
||||
if (text) {
|
||||
return summarize ? text.substring(0, 280) : text;
|
||||
}
|
||||
}
|
||||
|
||||
return element.textContent?.trim()?.substring(0, 280) || "";
|
||||
const fallbackText = element.textContent?.trim() || "";
|
||||
return summarize ? fallbackText.substring(0, 280) : fallbackText;
|
||||
}
|
||||
|
||||
extractPostAuthor(element) {
|
||||
|
||||
@ -260,11 +260,27 @@ function formatPageAnalyzeResult(result, metadata) {
|
||||
function formatContentExtractionResult(result, metadata) {
|
||||
const contentSummary = `Extracted ${result.content_type} content using ${result.method}:\n\n`;
|
||||
if (result.content) {
|
||||
const preview =
|
||||
typeof result.content === "string"
|
||||
? result.content.substring(0, 500) +
|
||||
(result.content.length > 500 ? "..." : "")
|
||||
: JSON.stringify(result.content, null, 2).substring(0, 500);
|
||||
// Check if this is full content extraction (summarize=false) or summary
|
||||
// If it's an object whose nested `content` is longer than 1000 characters, show it in full
|
||||
// If it's a string or small content, it's probably summarized
|
||||
let preview;
|
||||
if (typeof result.content === "string") {
|
||||
// String content - likely summarized, keep truncation
|
||||
preview = result.content.substring(0, 500) + (result.content.length > 500 ? "..." : "");
|
||||
} else if (result.content && typeof result.content === "object") {
|
||||
// Object content - check if it's full content extraction
|
||||
if (result.content.content && result.content.content.length > 1000) {
|
||||
// This looks like full content extraction - don't truncate
|
||||
preview = JSON.stringify(result.content, null, 2);
|
||||
} else {
|
||||
// Smaller content, apply truncation
|
||||
preview = JSON.stringify(result.content, null, 2).substring(0, 500);
|
||||
}
|
||||
} else {
|
||||
// Fallback
|
||||
preview = JSON.stringify(result.content, null, 2).substring(0, 500);
|
||||
}
|
||||
|
||||
return `${contentSummary}${preview}\n\n${JSON.stringify(
|
||||
metadata,
|
||||
null,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user