Files
supermarket/experiments/html-extraction/extract-product-from-html.ts
Mc Smog dd4c64c601 refactor: reorganize scripts - move debug code to experiments/
- Move debug/test scripts from src/scripts/ to experiments/
- Remove test-detail-endpoint from package.json
- Delete temp-product-page.html
- Move E2E_GUIDE.md to docs/
- Add experiments/README.md with documentation
- Keep only production scripts in src/scripts/
- Clean up tsconfig.json exclude list (experiments are now outside src/)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-01-22 01:55:20 +05:00

117 lines
3.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import 'dotenv/config';
import { chromium } from 'playwright';
import * as fs from 'fs';
import { Logger } from '../utils/logger.js';
/** Diagnostic snapshot collected inside the page by {@link main}. */
interface ProductProbe {
  /** Trimmed text of the first `<h1>`, or an empty string when there is none. */
  title: string;
  /** First JSON-LD node recognised as a product, if any. */
  jsonLd?: Record<string, unknown>;
  /** Top-level keys of `window.__NUXT__`, when that global is an object. */
  nuxtKeys?: string[];
  /** First `__NUXT__` key whose serialized value mentions brand/description/weight. */
  nuxtDataKey?: string;
  /** First 500 characters of the serialized value stored under `nuxtDataKey`. */
  nuxtDataPreview?: string;
  /** Set when an inline script contained a JSON object with product-like fields. */
  foundInScript?: boolean;
  /** First 500 characters of that JSON object, re-serialized. */
  scriptDataPreview?: string;
  /** First 500 characters of the outer HTML of the first product-like element. */
  productElement?: string;
  /** Values of the `itemprop` meta tags (undefined when a tag is absent). */
  structuredData?: {
    metaBrand?: string;
    metaDesc?: string;
    metaWeight?: string;
  };
}

/**
 * Opens a hard-coded product page in headless Chromium, probes several places
 * where product data (brand, description, weight) may live, logs the findings
 * as JSON and saves the page HTML to `temp-product-page.html`.
 *
 * The browser is closed even when navigation, probing or the file write fails.
 *
 * @throws Whatever Playwright or `fs` throws; the error is propagated to the caller.
 */
async function main(): Promise<void> {
  Logger.info('=== Извлечение данных о товаре из HTML ===\n');

  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    const productUrl = 'https://magnit.ru/product/1000233138-podguzniki_la_fresh_dlya_vzroslykh_l_10sht?shopCode=992301&shopType=6';
    Logger.info(`Загружаю страницу: ${productUrl}`);
    await page.goto(productUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 20000,
    });
    // Fixed pause after DOMContentLoaded; presumably gives client-side
    // rendering time to finish before the DOM is probed.
    await page.waitForTimeout(3000);

    // Extract data from the HTML. The callback is serialized and executed in
    // the page, so it must stay self-contained: no references to outer
    // variables and no nested named helper functions.
    const productData = await page.evaluate(() => {
      const result: ProductProbe = {
        title: document.querySelector('h1')?.textContent?.trim() ?? '',
      };

      // 1. Look in microdata meta tags.
      const metaBrand = document.querySelector<HTMLMetaElement>('meta[itemprop="brand"]')?.content;
      const metaDesc = document.querySelector<HTMLMetaElement>('meta[itemprop="description"]')?.content;
      const metaWeight = document.querySelector<HTMLMetaElement>('meta[itemprop="weight"]')?.content;

      // 2. Look in JSON-LD structured data.
      const jsonLdScripts = Array.from(document.querySelectorAll('script[type="application/ld+json"]'));
      for (const script of jsonLdScripts) {
        let parsed: unknown;
        try {
          parsed = JSON.parse(script.textContent || '');
        } catch {
          // Not valid JSON: skip this script, the probe is best-effort.
          continue;
        }

        // A JSON-LD document may be a single node, an array of nodes, or an
        // object wrapping its nodes in "@graph"; flatten all three shapes.
        const topLevel: unknown[] = Array.isArray(parsed) ? parsed : [parsed];
        const nodes: unknown[] = [];
        for (const item of topLevel) {
          nodes.push(item);
          if (typeof item === 'object' && item !== null) {
            const graph = (item as Record<string, unknown>)['@graph'];
            if (Array.isArray(graph)) {
              nodes.push(...graph);
            }
          }
        }

        for (const node of nodes) {
          if (typeof node !== 'object' || node === null) continue;
          const record = node as Record<string, unknown>;
          const type = record['@type'];
          // "@type" may be a single string or an array of strings.
          const isProduct =
            type === 'Product' ||
            (Array.isArray(type) && type.includes('Product')) ||
            record.name === 'Product';
          if (isProduct) {
            result.jsonLd = record;
            break;
          }
        }
        if (result.jsonLd) break;
      }

      // 3. Look in window globals.
      const nuxtData = (window as Window & { __NUXT__?: unknown }).__NUXT__;
      if (typeof nuxtData === 'object' && nuxtData !== null) {
        result.nuxtKeys = Object.keys(nuxtData);
        // Check every top-level entry for product-related data.
        for (const [key, val] of Object.entries(nuxtData)) {
          if (!val || typeof val !== 'object') continue;
          let str: string;
          try {
            str = JSON.stringify(val);
          } catch {
            // Circular references or BigInt values cannot be serialized;
            // skip this entry instead of aborting the whole probe.
            continue;
          }
          if (str.includes('brand') || str.includes('description') || str.includes('weight')) {
            result.nuxtDataKey = key;
            result.nuxtDataPreview = str.substring(0, 500);
            break;
          }
        }
      }

      // 4. Look in the remaining script tags.
      const allScripts = Array.from(document.querySelectorAll('script'));
      for (const script of allScripts) {
        const text = script.textContent || '';
        if (!text.includes('"brand"') || text.length <= 100 || text.length >= 100000) continue;

        // Greedy match from the first "{" to the last "}" and try it as JSON.
        const match = text.match(/\{[\s\S]*\}/);
        if (!match) continue;
        try {
          const json: unknown = JSON.parse(match[0]);
          if (typeof json === 'object' && json !== null) {
            const record = json as Record<string, unknown>;
            if (record.brand || record.description || record.weight) {
              result.foundInScript = true;
              result.scriptDataPreview = JSON.stringify(record).substring(0, 500);
              break;
            }
          }
        } catch {
          // The matched span is ordinary JavaScript rather than JSON; try the next script.
        }
      }

      // 5. Look in data attributes.
      const productElement = document.querySelector('[data-product-id], [data-product], [id*="product"]');
      if (productElement) {
        result.productElement = productElement.outerHTML.substring(0, 500);
      }

      // 6. Report the microdata values gathered in step 1.
      result.structuredData = {
        metaBrand,
        metaDesc,
        metaWeight,
      };

      return result;
    });

    Logger.info('=== РЕЗУЛЬТАТЫ ===\n');
    Logger.info(JSON.stringify(productData, null, 2));

    // Also save the HTML for offline analysis.
    const html = await page.content();
    const outputPath = 'temp-product-page.html';
    fs.writeFileSync(outputPath, html, 'utf-8');
    Logger.info(`\nHTML сохранен в: ${outputPath}`);
  } finally {
    // Always release the Chromium process, including on failure.
    await browser.close();
  }
}
// Run the script. Report a failure explicitly and exit with a non-zero status
// instead of leaving the promise rejection unhandled.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});