initialised repo

2026-02-02 15:58:45 +00:00
commit b060e7f008
46 changed files with 8574 additions and 0 deletions
--- a/lib/scraper.ts
+++ b/lib/scraper.ts
@@ -0,0 +1,137 @@
+import puppeteer from 'puppeteer'
+import type { ScrapedContent } from './types'
+
+/**
+ * Error raised by the scraper. In addition to the human-readable message it
+ * carries a short `code` so callers can branch on the failure category
+ * without parsing the message text.
+ */
+export class ScrapingError extends Error {
+  /** Failure category, e.g. 'INVALID_URL', 'DNS_ERROR', 'TIMEOUT' or 'UNKNOWN'. */
+  public code: string
+
+  /**
+   * @param message - User-facing explanation of what went wrong.
+   * @param code - Short identifier for the failure category.
+   */
+  constructor(message: string, code: string) {
+    super(message)
+    this.code = code
+    // Override the default "Error" name so logs show the concrete type.
+    this.name = 'ScrapingError'
+  }
+}
+
+/**
+ * Prepares user-supplied input for navigation: trims surrounding whitespace
+ * and prepends `https://` when the input has no http(s) scheme.
+ *
+ * @param url - Raw URL or bare host name as typed by the user.
+ * @returns The URL string that will be passed to the browser.
+ * @throws ScrapingError with code `INVALID_URL` when the result cannot be parsed as a URL.
+ */
+function normalizeScrapeUrl(url: string): string {
+  const trimmed = url.trim()
+  // Test for a real scheme rather than just the letters "http": bare hosts
+  // such as "httpbin.org" must still receive the default https:// prefix,
+  // and "HTTP://..." must not be prefixed a second time.
+  const candidate = /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`
+
+  try {
+    new URL(candidate)
+  } catch {
+    throw new ScrapingError('Invalid URL format. Please enter a valid website URL.', 'INVALID_URL')
+  }
+
+  return candidate
+}
+
+/**
+ * Maps a low-level failure raised while launching the browser or loading the
+ * page onto a ScrapingError with a user-facing message.
+ *
+ * @param error - Whatever was thrown; narrowed here instead of being assumed to be an Error.
+ * @param targetUrl - The URL that was being scraped, used in the message.
+ * @returns A ScrapingError coded `DNS_ERROR`, `TIMEOUT` or `UNKNOWN`.
+ */
+function toScrapingError(error: unknown, targetUrl: string): ScrapingError {
+  const message = error instanceof Error ? error.message : String(error)
+
+  // Any Chromium network error (net::ERR_*) is reported as "could not reach".
+  if (message.includes('ERR_NAME_NOT_RESOLVED') || message.includes('net::ERR')) {
+    return new ScrapingError(
+      `Could not reach ${targetUrl}. Please check the URL or try entering your product description manually.`,
+      'DNS_ERROR'
+    )
+  }
+
+  if (message.includes('timeout')) {
+    return new ScrapingError(
+      'The website took too long to respond. Please try again or enter your product description manually.',
+      'TIMEOUT'
+    )
+  }
+
+  return new ScrapingError(
+    'Failed to scrape the website. Please try again or enter your product description manually.',
+    'UNKNOWN'
+  )
+}
+
+/**
+ * Loads a web page in headless Chromium and extracts its title, meta
+ * description, headings, paragraphs, list items, internal link labels and
+ * visible body text.
+ *
+ * @param url - Page address. A missing `http://` / `https://` scheme defaults to `https://`.
+ * @returns The extracted content, de-duplicated and capped (20 headings,
+ *   30 paragraphs, 20 list items, 15 links, 10000 characters of raw text).
+ * @throws ScrapingError with code `INVALID_URL`, `DNS_ERROR`, `TIMEOUT` or `UNKNOWN`.
+ */
+export async function scrapeWebsite(url: string): Promise<ScrapedContent> {
+  const validatedUrl = normalizeScrapeUrl(url)
+
+  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined
+  try {
+    // NOTE(review): the sandbox is disabled here; this is presumably required
+    // by the hosting environment - confirm, since the pages loaded are untrusted.
+    browser = await puppeteer.launch({
+      headless: 'new',
+      args: ['--no-sandbox', '--disable-setuid-sandbox']
+    })
+
+    const page = await browser.newPage()
+    await page.setViewport({ width: 1920, height: 1080 })
+    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
+
+    await page.goto(validatedUrl, {
+      waitUntil: 'networkidle2',
+      timeout: 30000
+    })
+
+    // Runs inside the page, so it can only use browser globals and must not
+    // reference anything from this module's scope.
+    const extractedContent = await page.evaluate(() => {
+      const title = document.title || document.querySelector('h1')?.textContent || ''
+
+      const metaDesc = document.querySelector('meta[name="description"]')?.getAttribute('content') ||
+                       document.querySelector('meta[property="og:description"]')?.getAttribute('content') || ''
+
+      const headings: string[] = []
+      document.querySelectorAll('h1, h2, h3').forEach(el => {
+        const text = el.textContent?.trim()
+        if (text && text.length > 5 && text.length < 200) headings.push(text)
+      })
+
+      const paragraphs: string[] = []
+      document.querySelectorAll('p').forEach(el => {
+        const text = el.textContent?.trim()
+        // Paragraphs containing '{' are skipped (likely inlined JSON/CSS/template text).
+        if (text && text.length > 30 && text.length < 500 && !text.includes('{')) {
+          paragraphs.push(text)
+        }
+      })
+
+      const featureList: string[] = []
+      document.querySelectorAll('ul li, ol li').forEach(el => {
+        const text = el.textContent?.trim()
+        if (text && text.length > 10 && text.length < 200) featureList.push(text)
+      })
+
+      // Only the labels of same-site (relative) links are collected.
+      const links: string[] = []
+      document.querySelectorAll('a[href^="/"], a[href^="./"]').forEach(el => {
+        const text = el.textContent?.trim()
+        if (text && text.length > 3 && text.length < 50) links.push(text)
+      })
+
+      // Get all visible text for raw analysis. A document can lack a <body>,
+      // in which case the raw text is simply empty.
+      const bodyText = document.body?.innerText ?? ''
+
+      return {
+        title,
+        metaDescription: metaDesc,
+        headings,
+        paragraphs,
+        featureList,
+        links,
+        rawText: bodyText
+      }
+    })
+
+    return {
+      url: validatedUrl,
+      title: extractedContent.title,
+      metaDescription: extractedContent.metaDescription,
+      headings: [...new Set(extractedContent.headings)].slice(0, 20),
+      paragraphs: [...new Set(extractedContent.paragraphs)].slice(0, 30),
+      featureList: [...new Set(extractedContent.featureList)].slice(0, 20),
+      links: [...new Set(extractedContent.links)].slice(0, 15),
+      rawText: extractedContent.rawText.slice(0, 10000) // Limit raw text
+    }
+  } catch (error: unknown) {
+    console.error('Scraping error:', error)
+    throw toScrapingError(error, validatedUrl)
+  } finally {
+    if (browser) {
+      // A failure while shutting the browser down must not replace the
+      // scrape result or the ScrapingError already being propagated.
+      await browser.close().catch((closeError: unknown) => {
+        console.error('Failed to close browser:', closeError)
+      })
+    }
+  }
+}
+
+/**
+ * Builds a scraped-content record from manually entered product details, for
+ * use when no website is scraped.
+ *
+ * @param productName - Product name; used as the title and first heading.
+ * @param description - Free-text description; used as the meta description.
+ * @param features - Feature list with one feature per line.
+ * @returns A record whose `url` is the literal 'manual-input' and whose
+ *   `featureList` holds the non-blank lines of `features`, trimmed.
+ */
+export async function analyzeFromText(
+  productName: string,
+  description: string,
+  features: string
+): Promise<ScrapedContent> {
+  // Accept both LF and CRLF line endings and trim each entry, so pasted text
+  // does not leave a trailing '\r' or stray indentation on the features.
+  const featureList = features
+    .split(/\r?\n/)
+    .map(line => line.trim())
+    .filter(line => line.length > 0)
+
+  return {
+    url: 'manual-input',
+    title: productName,
+    metaDescription: description,
+    headings: [productName, 'Features', 'Benefits'],
+    paragraphs: [description, features],
+    featureList,
+    links: [],
+    rawText: `${productName}\n\n${description}\n\n${features}`
+  }
+}