import puppeteer from 'puppeteer'
import type { ScrapedContent } from './types'
import { logServer } from "@/lib/server-logger";

/** Navigation timeout passed to `page.goto`, in milliseconds. */
const NAVIGATION_TIMEOUT_MS = 30000

/** Upper bounds applied to the de-duplicated lists returned by `scrapeWebsite`. */
const MAX_HEADINGS = 20
const MAX_PARAGRAPHS = 30
const MAX_FEATURES = 20
const MAX_LINKS = 15

/** Maximum number of characters of visible body text kept in `rawText`. */
const MAX_RAW_TEXT_CHARS = 10000

/**
 * Error thrown by the scraper. `message` is written to be shown to the end
 * user; `code` is a short machine-readable discriminator. The codes produced
 * in this file are 'INVALID_URL', 'DNS_ERROR', 'TIMEOUT' and 'UNKNOWN'.
 */
export class ScrapingError extends Error {
  constructor(message: string, public code: string) {
    super(message)
    this.name = 'ScrapingError'
  }
}

/**
 * Trims the input, prepends `https://` when no http(s) scheme is present, and
 * verifies the result parses as a URL.
 *
 * The scheme test is case-insensitive and requires the `:` so that hosts that
 * merely begin with the letters "http" (e.g. `httpbin.org`) still get a scheme.
 *
 * @throws ScrapingError with code 'INVALID_URL' when the result cannot be parsed.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim()
  const candidate = /^https?:/i.test(trimmed) ? trimmed : `https://${trimmed}`
  try {
    new URL(candidate)
  } catch {
    throw new ScrapingError('Invalid URL format. Please enter a valid website URL.', 'INVALID_URL')
  }
  return candidate
}

/** Removes duplicates (keeping first occurrences) and caps the list at `limit` items. */
function dedupeAndLimit(items: string[], limit: number): string[] {
  return [...new Set(items)].slice(0, limit)
}

/**
 * Maps a failure raised while launching/navigating/extracting to a user-facing
 * ScrapingError.
 *
 * Order matters: any Chromium network error (`net::ERR…`) is reported as
 * 'DNS_ERROR' before the timeout check runs, which matches the codes callers
 * have always received from this module.
 */
function toScrapingError(error: unknown, url: string): ScrapingError {
  const message = error instanceof Error ? error.message : String(error)

  if (message.includes('ERR_NAME_NOT_RESOLVED') || message.includes('net::ERR')) {
    return new ScrapingError(
      `Could not reach ${url}. Please check the URL or try entering your product description manually.`,
      'DNS_ERROR'
    )
  }

  const isTimeout =
    (error instanceof Error && error.name === 'TimeoutError') || message.includes('timeout')
  if (isTimeout) {
    return new ScrapingError(
      'The website took too long to respond. Please try again or enter your product description manually.',
      'TIMEOUT'
    )
  }

  return new ScrapingError(
    'Failed to scrape the website. Please try again or enter your product description manually.',
    'UNKNOWN'
  )
}

/**
 * Loads `url` in a headless browser and extracts its title, meta description,
 * headings, paragraphs, list items, same-site link labels and visible text.
 *
 * @param url - Website address; `https://` is prepended when no http(s) scheme is given.
 * @returns The extracted content, with each list de-duplicated and capped and
 *   `rawText` truncated to `MAX_RAW_TEXT_CHARS` characters.
 * @throws ScrapingError with code 'INVALID_URL', 'DNS_ERROR', 'TIMEOUT' or 'UNKNOWN'.
 */
export async function scrapeWebsite(url: string): Promise<ScrapedContent> {
  const validatedUrl = normalizeUrl(url)

  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined
  try {
    browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    })

    const page = await browser.newPage()
    await page.setViewport({ width: 1920, height: 1080 })
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    await page.goto(validatedUrl, { waitUntil: 'networkidle2', timeout: NAVIGATION_TIMEOUT_MS })

    // This callback is serialized and executed inside the page, so it must be
    // self-contained: it cannot reference module-level constants or helpers.
    const extractedContent = await page.evaluate(() => {
      const title = document.title || document.querySelector('h1')?.textContent || ''

      const metaDescription =
        document.querySelector('meta[name="description"]')?.getAttribute('content') ||
        document.querySelector('meta[property="og:description"]')?.getAttribute('content') ||
        ''

      const headings = Array.from(
        document.querySelectorAll('h1, h2, h3'),
        el => el.textContent?.trim() ?? ''
      ).filter(text => text.length > 5 && text.length < 200)

      // Paragraphs containing '{' are dropped (filters out inlined code/JSON/CSS-like text).
      const paragraphs = Array.from(
        document.querySelectorAll('p'),
        el => el.textContent?.trim() ?? ''
      ).filter(text => text.length > 30 && text.length < 500 && !text.includes('{'))

      const featureList = Array.from(
        document.querySelectorAll('ul li, ol li'),
        el => el.textContent?.trim() ?? ''
      ).filter(text => text.length > 10 && text.length < 200)

      // Only anchors whose href starts with "/" or "./" are considered.
      const links = Array.from(
        document.querySelectorAll('a[href^="/"], a[href^="./"]'),
        el => el.textContent?.trim() ?? ''
      ).filter(text => text.length > 3 && text.length < 50)

      // Get all visible text for raw analysis; body may be absent on unusual documents.
      const rawText = document.body?.innerText || ''

      return { title, metaDescription, headings, paragraphs, featureList, links, rawText }
    })

    return {
      url: validatedUrl,
      title: extractedContent.title,
      metaDescription: extractedContent.metaDescription,
      headings: dedupeAndLimit(extractedContent.headings, MAX_HEADINGS),
      paragraphs: dedupeAndLimit(extractedContent.paragraphs, MAX_PARAGRAPHS),
      featureList: dedupeAndLimit(extractedContent.featureList, MAX_FEATURES),
      links: dedupeAndLimit(extractedContent.links, MAX_LINKS),
      rawText: extractedContent.rawText.slice(0, MAX_RAW_TEXT_CHARS) // Limit raw text
    }
  } catch (error: unknown) {
    try {
      await logServer({
        level: "error",
        message: "Scraping error",
        labels: ["scraper", "error"],
        payload: { url: validatedUrl, error: String(error) },
        source: "lib/scraper",
      })
    } catch {
      // Logging is best-effort: a logger failure must not replace the
      // user-facing ScrapingError thrown below.
    }

    throw toScrapingError(error, validatedUrl)
  } finally {
    if (browser) {
      try {
        await browser.close()
      } catch {
        // A failure while closing must not mask the scrape result or the
        // ScrapingError already being propagated.
      }
    }
  }
}

/**
 * Builds a ScrapedContent-shaped result from manually entered product details,
 * for use when a website cannot be scraped.
 *
 * @param productName - Used as the title and the first heading.
 * @param description - Used as the meta description and the first paragraph.
 * @param features - Newline-separated features; each line is trimmed and blank
 *   lines are dropped. Both LF and CRLF line endings are accepted.
 * @returns Content whose `url` is the literal 'manual-input' and whose `links` is empty.
 */
export async function analyzeFromText(
  productName: string,
  description: string,
  features: string
): Promise<ScrapedContent> {
  const featureList = features
    .split(/\r?\n/)
    .map(line => line.trim())
    .filter(line => line.length > 0)

  return {
    url: 'manual-input',
    title: productName,
    metaDescription: description,
    headings: [productName, 'Features', 'Benefits'],
    paragraphs: [description, features],
    featureList,
    links: [],
    rawText: `${productName}\n\n${description}\n\n${features}`
  }
}