Files
supermarket/experiments/html-extraction/extract-product-from-html.ts
Mc Smog b8f170d83b fix: update import paths in debug scripts after reorganization
- Fix relative imports in experiments/ scripts (../ → ../../)
- Clean up tsconfig.json exclude list (remove non-existent paths)
- All debug scripts now work from their new location

Co-Authored-By: Claude <noreply@anthropic.com>
2026-01-22 02:02:52 +05:00

117 lines
3.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import 'dotenv/config';
import { chromium } from 'playwright';
import * as fs from 'fs';
import { Logger } from '../../utils/logger.js';
/**
 * Debug experiment: opens one hard-coded magnit.ru product page in headless
 * Chromium, probes several places in the rendered page for product metadata
 * (brand / description / weight), logs the findings as pretty-printed JSON and
 * writes the page HTML to `temp-product-page.html` (relative to the current
 * working directory) for offline inspection.
 *
 * The browser is closed in a `finally` block, so it is released even when
 * navigation, extraction or the file write fails; the error still propagates
 * to the caller.
 */
async function main(): Promise<void> {
  Logger.info('=== Извлечение данных о товаре из HTML ===\n');
  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext();
    const page = await context.newPage();
    const productUrl = 'https://magnit.ru/product/1000233138-podguzniki_la_fresh_dlya_vzroslykh_l_10sht?shopCode=992301&shopType=6';
    Logger.info(`Загружаю страницу: ${productUrl}`);
    await page.goto(productUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 20000,
    });
    // Fixed pause after DOMContentLoaded before probing the page
    // (presumably to let client-side rendering finish — not verified here).
    await page.waitForTimeout(3000);

    // Extract data from the HTML. This callback is serialized and executed
    // inside the page, so it must stay self-contained: it cannot reference
    // anything from the surrounding Node.js scope.
    const productData = await page.evaluate(() => {
      const result: Record<string, unknown> = {
        title: document.querySelector('h1')?.textContent?.trim() || '',
      };

      // 1. Look in microdata <meta> tags.
      const metaBrand = document.querySelector<HTMLMetaElement>('meta[itemprop="brand"]')?.content;
      const metaDesc = document.querySelector<HTMLMetaElement>('meta[itemprop="description"]')?.content;
      const metaWeight = document.querySelector<HTMLMetaElement>('meta[itemprop="weight"]')?.content;

      // 2. Look in JSON-LD structured data. A document may be a single node,
      //    a top-level array of nodes, or wrap its nodes in "@graph"; "@type"
      //    may be a string or an array of strings.
      const jsonLdScripts = Array.from(document.querySelectorAll('script[type="application/ld+json"]'));
      for (const script of jsonLdScripts) {
        try {
          const parsed: unknown = JSON.parse(script.textContent || '');
          const topLevel: unknown[] = Array.isArray(parsed) ? parsed : [parsed];
          const nodes: unknown[] = [];
          for (const entry of topLevel) {
            nodes.push(entry);
            const graph = (entry as { '@graph'?: unknown } | null)?.['@graph'];
            if (Array.isArray(graph)) {
              nodes.push(...graph);
            }
          }
          const product = nodes.find((node) => {
            if (!node || typeof node !== 'object') {
              return false;
            }
            const { '@type': type, name } = node as { '@type'?: unknown; name?: unknown };
            return (
              type === 'Product' ||
              (Array.isArray(type) && type.includes('Product')) ||
              name === 'Product'
            );
          });
          if (product) {
            result.jsonLd = product;
            break;
          }
        } catch {
          // Best-effort probe: an empty or malformed JSON-LD script is skipped.
        }
      }

      // 3. Look in window-level state objects.
      const nuxtData = (window as Window & { __NUXT__?: Record<string, unknown> }).__NUXT__;
      if (nuxtData) {
        result.nuxtKeys = Object.keys(nuxtData);
        // Check every top-level entry that could hold the product data.
        for (const key of Object.keys(nuxtData)) {
          const val = nuxtData[key];
          if (!val || typeof val !== 'object') {
            continue;
          }
          try {
            const str = JSON.stringify(val);
            if (str.includes('brand') || str.includes('description') || str.includes('weight')) {
              result.nuxtDataKey = key;
              result.nuxtDataPreview = str.substring(0, 500);
              break;
            }
          } catch {
            // Entry cannot be serialized (e.g. circular references); skip it
            // instead of failing the whole extraction.
          }
        }
      }

      // 4. Look in the remaining <script> tags.
      const allScripts = Array.from(document.querySelectorAll('script'));
      for (const script of allScripts) {
        const text = script.textContent || '';
        // Only consider scripts that mention "brand" and have a plausible size.
        if (!text.includes('"brand"') || text.length <= 100 || text.length >= 100000) {
          continue;
        }
        // Greedy match: everything from the first '{' to the last '}'.
        const match = text.match(/\{[\s\S]*\}/);
        if (!match) {
          continue;
        }
        try {
          const json: Record<string, unknown> = JSON.parse(match[0]);
          if (json.brand || json.description || json.weight) {
            result.foundInScript = true;
            result.scriptDataPreview = JSON.stringify(json).substring(0, 500);
            break;
          }
        } catch {
          // The matched span is not valid JSON; try the next script.
        }
      }

      // 5. Look for an element carrying product data attributes / ids.
      const productElement = document.querySelector('[data-product-id], [data-product], [id*="product"]');
      if (productElement) {
        result.productElement = productElement.outerHTML.substring(0, 500);
      }

      // 6. Report what the <meta> lookup from step 1 produced.
      result.structuredData = {
        metaBrand,
        metaDesc,
        metaWeight,
      };
      return result;
    });

    Logger.info('=== РЕЗУЛЬТАТЫ ===\n');
    Logger.info(JSON.stringify(productData, null, 2));

    // Also save the HTML for later analysis.
    const html = await page.content();
    const outputPath = 'temp-product-page.html';
    fs.writeFileSync(outputPath, html, 'utf-8');
    Logger.info(`\nHTML сохранен в: ${outputPath}`);
  } finally {
    // Always release the browser process, including on failure paths.
    await browser.close();
  }
}
// Script entry point. The promise returned by main() is handled explicitly so
// a failure is reported and turned into a non-zero exit code instead of
// surfacing as an unhandled rejection.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});