/*
 * countWords function counts the number of words in a given string.
 * If Intl.Segmenter is available, it will use it to count words, unless the content contains Chinese or Japanese ideographs.
 * The fallback version uses a regular expression to split the content into words.
 * For languages that don't use spaces, any character will be considered a word in the fallback version
 */
import { isFunction } from '@/domain/utils/TypePredicates';

/**
 * Counts the words in `content`.
 *
 * URL-like substrings are first collapsed into a single placeholder token so
 * that each one contributes exactly one word. Then one of two strategies runs:
 *
 * - When `Intl.Segmenter` is available and the content has no Chinese/Japanese
 *   characters, the text is segmented with `granularity: 'word'` and only
 *   word-like segments are counted. `&nbsp;` entities become spaces and hyphens
 *   are removed before segmentation, so a hyphenated compound is segmented as
 *   one token.
 * - Otherwise the text is split with a regular expression on whitespace,
 *   `&nbsp;` entities and free-standing punctuation; every Han, Hiragana,
 *   Katakana, Thai or Hangul character starts a new word.
 *
 * @param content - The text to analyse.
 * @returns The number of words found.
 */
function countWords(content: string): number {
	// Check if the content contains any Chinese and Japanese ideographs
	// https://stackoverflow.com/questions/43418812/check-whether-a-string-contains-japanese-chinese-characters#answer-43419070
	const hasIdeographs = /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]/.test(content);

	// Replace URLs with a placeholder so that their dots and slashes are not treated as word boundaries
	const urlPattern = /(https?:\/\/)?([a-zA-Z0-9-]+\.)?([a-zA-Z0-9-]+\.[a-zA-Z]{2,})(\/?[^\s]*)?/gi;
	const contentWithUrlPlaceholders = content.replace(urlPattern, 'URL_PLACEHOLDER');

	if (!hasIdeographs && isFunction(Intl.Segmenter)) {
		const segmenter = new Intl.Segmenter(undefined, { granularity: 'word' });

		// Replace nbsp entities with a space and drop hyphens before segmenting
		const cleanedContent = contentWithUrlPlaceholders.replace(/&nbsp;/g, ' ').replace(/-/g, '');

		let wordCounter = 0;

		// The Segments object is itself iterable; punctuation and whitespace segments are not word-like
		for (const segment of segmenter.segment(cleanedContent)) {
			if (segment.isWordLike) {
				wordCounter++;
			}
		}

		return wordCounter;
	}

	// Use space and punctuation as word boundaries. For languages that don't use spaces we will consider any character as a word
	const regex = new RegExp(
		[
			'&nbsp;', // Match non-breaking spaces
			// Match free-standing punctuation (surrounded by whitespace or the string edges).
			// The hyphen is part of the class so that a lone "-" is not counted as a word,
			// which keeps this branch consistent with the Intl.Segmenter branch.
			'(^|\\s+)[!\'"#$%&()*+,\\-./\\\\:;<=>?@[\\]^_`{|}~«»\\u2000-\\u206F\\u2E00-\\u2E7F\\s]+(\\s+|$)',
			'\\s+', // Match any whitespace
			// Match specific scripts (Chinese, Japanese, Thai, Korean).
			'(?=[\\p{Script=Han}\\p{Script=Hiragana}\\p{Script=Katakana}\\p{Script=Thai}\\p{Script=Hangul}])',
		].join('|'),
		'u'
	);

	// split() also emits the capture groups (whitespace, empty string or undefined);
	// the filter discards those together with blank fragments.
	return contentWithUrlPlaceholders.split(regex).filter((value) => value && value.trim()).length;
}

/**
 * Counts the characters in `content`.
 *
 * Surrounding whitespace is trimmed first, then every `&nbsp;` entity is
 * treated as a single space. Line terminators are not counted because `.`
 * does not match them. Characters are counted per Unicode code point, so a
 * character encoded as a surrogate pair (for example an emoji) counts once.
 *
 * @param content - The text to analyse.
 * @returns The number of characters; `0` for empty or whitespace-only input.
 */
function countCharacters(content: string): number {
	const normalized = content.trim().replace(/&nbsp;/g, ' ');

	// The `u` flag makes `.` match whole code points instead of UTF-16 code units;
	// match() returns null when nothing matches, hence the `?? 0`.
	return normalized.match(/./gu)?.length ?? 0;
}

export { countWords, countCharacters };
