refactor: reorganize scripts - move debug code to experiments/
- Move debug/test scripts from src/scripts/ to experiments/
- Remove test-detail-endpoint from package.json
- Delete temp-product-page.html
- Move E2E_GUIDE.md to docs/
- Add experiments/README.md with documentation
- Keep only production scripts in src/scripts/
- Clean up tsconfig.json exclude list (experiments are now outside src/)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
116
experiments/html-extraction/extract-product-from-html.ts
Normal file
116
experiments/html-extraction/extract-product-from-html.ts
Normal file
@@ -0,0 +1,116 @@
|
||||
import 'dotenv/config';
|
||||
import { chromium } from 'playwright';
|
||||
import * as fs from 'fs';
|
||||
import { Logger } from '../utils/logger.js';
|
||||
|
||||
/**
 * Experiment: opens a hard-coded magnit.ru product page in headless Chromium and
 * probes several places in the rendered document for product data (page title,
 * `itemprop` meta tags, JSON-LD, `window.__NUXT__`, inline scripts and
 * product-looking elements).
 *
 * The findings are logged as pretty-printed JSON and the rendered HTML is
 * written to `temp-product-page.html` in the current working directory.
 *
 * The browser is always closed, even when navigation or extraction throws.
 *
 * @returns A promise that resolves once the browser has been closed; it
 *   rejects if launching, navigating, extracting or writing the file fails.
 */
async function main(): Promise<void> {
  Logger.info('=== Извлечение данных о товаре из HTML ===\n');

  const browser = await chromium.launch({ headless: true });

  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    const productUrl = 'https://magnit.ru/product/1000233138-podguzniki_la_fresh_dlya_vzroslykh_l_10sht?shopCode=992301&shopType=6';

    Logger.info(`Загружаю страницу: ${productUrl}`);

    await page.goto(productUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 20000,
    });

    // Fixed delay after DOMContentLoaded — presumably to let client-side
    // rendering finish before the DOM is inspected.
    await page.waitForTimeout(3000);

    // Extract data from the HTML. The callback runs inside the browser page,
    // so it must not reference anything from the surrounding Node scope.
    const productData = await page.evaluate(() => {
      // Brand, description and weight are looked for in several places below.
      const result: Record<string, unknown> = {
        title: document.querySelector('h1')?.textContent?.trim() || '',
      };

      // 1. Look in meta tags.
      const metaBrand = document.querySelector<HTMLMetaElement>('meta[itemprop="brand"]')?.content;
      const metaDesc = document.querySelector<HTMLMetaElement>('meta[itemprop="description"]')?.content;
      const metaWeight = document.querySelector<HTMLMetaElement>('meta[itemprop="weight"]')?.content;

      // 2. Look in JSON-LD structured data; keep the first matching block.
      const jsonLdScripts = Array.from(document.querySelectorAll('script[type="application/ld+json"]'));
      for (const script of jsonLdScripts) {
        try {
          const json = JSON.parse(script.textContent || '');
          // NOTE(review): `name === 'Product'` matches an entity literally
          // named "Product"; confirm this is intended and not a typo for a
          // second `@type` check.
          if (json['@type'] === 'Product' || json.name === 'Product') {
            result.jsonLd = json;
            break;
          }
        } catch {
          // Not valid JSON (or not an object) — skip this script tag.
        }
      }

      // 3. Look in window objects.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- __NUXT__ is an untyped page global
      const nuxtData = (window as any).__NUXT__;
      if (nuxtData) {
        result.nuxtKeys = Object.keys(nuxtData);

        // Check every top-level entry for anything that mentions product fields.
        for (const key of Object.keys(nuxtData)) {
          const val = nuxtData[key];
          if (!val || typeof val !== 'object') {
            continue;
          }

          try {
            const str = JSON.stringify(val);
            if (str.includes('brand') || str.includes('description') || str.includes('weight')) {
              result.nuxtDataKey = key;
              result.nuxtDataPreview = str.substring(0, 500);
              break;
            }
          } catch {
            // Entry could not be serialized (e.g. circular reference) — a
            // single bad entry must not abort the whole probe.
          }
        }
      }

      // 4. Look in the remaining script tags.
      const allScripts = Array.from(document.querySelectorAll('script'));
      for (const script of allScripts) {
        const text = script.textContent || '';
        if (!text.includes('"brand"') || text.length <= 100 || text.length >= 100000) {
          continue;
        }

        // Try to find JSON: greedy match from the first `{` to the last `}`.
        const match = text.match(/\{[\s\S]*\}/);
        if (!match) {
          continue;
        }

        try {
          const json = JSON.parse(match[0]);
          if (json.brand || json.description || json.weight) {
            result.foundInScript = true;
            result.scriptDataPreview = JSON.stringify(json).substring(0, 500);
            break;
          }
        } catch {
          // The matched span is not valid JSON — try the next script tag.
        }
      }

      // 5. Look in data attributes.
      const productElement = document.querySelector('[data-product-id], [data-product], [id*="product"]');
      if (productElement) {
        result.productElement = productElement.outerHTML.substring(0, 500);
      }

      // 6. Report what the meta tags contained.
      result.structuredData = {
        metaBrand,
        metaDesc,
        metaWeight,
      };

      return result;
    });

    Logger.info('=== РЕЗУЛЬТАТЫ ===\n');
    Logger.info(JSON.stringify(productData, null, 2));

    // Also save the HTML for later analysis.
    const html = await page.content();
    const outputPath = 'temp-product-page.html';
    fs.writeFileSync(outputPath, html, 'utf-8');
    Logger.info(`\nHTML сохранен в: ${outputPath}`);
  } finally {
    // Release the Chromium process on both the success and the failure path.
    await browser.close();
  }
}
|
||||
|
||||
// Script entry point. The promise returned by main() is handled explicitly so
// that a failure is reported and the process exits with a non-zero status
// instead of surfacing as an unhandled rejection.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user