/**
 * One selector entry of the scraper configuration consumed by `parseExtensionInput`.
 * NOTE(review): the field names look like a Web Scraper browser-extension sitemap export — confirm.
 */
type WebSelectorConfig = {
  // Not read by parseSelector, parseChildNodes or parseExtensionInput.
  delay: number;
  // Selector name. The ids 'node', 'next' and 'subpage' and the suffix ':rm' are treated specially;
  // other ids are used as output field names.
  id: string;
  // Not read by parseSelector, parseChildNodes or parseExtensionInput.
  multiple: boolean;
  // Only the first entry is inspected; it is compared case-insensitively with '_root', 'node' and 'subpage'.
  parentSelectors: string[];
  // Optional pattern carried over into `Selector.regex`; a missing or empty value becomes null.
  regex?: string;
  // CSS selector text; split on ', ' when converted into a `Selector`.
  selector: string;
  // 'SelectorLink' and 'SelectorImage' get an attribute suffix in parseSelector; other values add nothing.
  type: string;
};

/** Input accepted by `parseExtensionInput`. */
type WebScraperConfig = {
  // Becomes the `code` of the parsed task; must end with 'news' or 'events', otherwise parsing throws.
  _id: string;
  // Not read by the parsing code in this file.
  startUrl: string[];
  // Flat selector list; grouped by each entry's first parent selector before parsing.
  selectors: WebSelectorConfig[];
};

// Field names that appear in both lists below.
const sharedFields = ['title', 'image', 'content_html', 'perex'];

// Field names parseChildNodes checks against when the task code ends with "events".
const eventFields = [...sharedFields, 'date_from', 'time_from', 'date_to', 'time_to', 'location'];

// Field names parseChildNodes checks against when the task code ends with "news".
const newsFields = [...sharedFields, 'date_published', 'time_published'];
/** Internal selector description produced by `parseSelector`. */
type Selector = {
  // CSS selectors obtained by splitting the source selector text on ', '.
  collect: string[];
  // CSS selectors taken from the matching '<id>:rm' entry; empty when there is none.
  remove: string[];
  // Pattern copied from the source selector; null when the source has no (or an empty) regex.
  regex: string | null | undefined;
  // Always created empty by the code in this file.
  replace: string[][];
};

/** Selectors for one output field of a news task. */
type NewsSelectors = {
  // Output field name (the selector id, after applying `equivalents`).
  field: string;
  // parseChildNodes always stores exactly one parsed selector here.
  selector: Selector[];
};

/** Selectors for one output field of an events task; same shape as `NewsSelectors`. */
type EventSelectors = {
  // Output field name (the selector id, after applying `equivalents`).
  field: string;
  // parseChildNodes always stores exactly one parsed selector here.
  selector: Selector[];
};

/** Parsed task configuration returned by `parseExtensionInput` for a code ending with 'news'. */
type News = {
  // Copied from the input `_id`.
  code: string;
  // parseExtensionInput always sets this to 2.
  maxPages?: number;
  // Raw (unsplit) selector text of the root entry with id 'node'.
  nodeSelector?: string[];
  // Parsed root entry with id 'next', when the input has one.
  nextPageSelector?: Selector[];
  // Parsed entry with id 'subpage', when the input has one.
  subpageSelector?: Selector[];
  // Field selectors whose parent is 'node'.
  nodeSelectors?: NewsSelectors[];
  // Field selectors whose parent is 'subpage'.
  subpageSelectors?: NewsSelectors[];
};
/** Parsed task configuration returned by `parseExtensionInput` for a code ending with 'events'. */
type Events = {
  // Copied from the input `_id`.
  code: string;
  // parseExtensionInput always sets this to 2.
  maxPages?: number;
  // Raw (unsplit) selector text of the root entry with id 'node'.
  nodeSelector?: string[];
  // Parsed root entry with id 'next', when the input has one.
  nextPageSelector?: Selector[];
  // Parsed entry with id 'subpage', when the input has one.
  subpageSelector?: Selector[];
  // Field selectors whose parent is 'node'.
  nodeSelectors?: EventSelectors[];
  // Field selectors whose parent is 'subpage'.
  subpageSelectors?: EventSelectors[];
};
/**
 * Converts one extension selector into the internal `Selector` shape.
 *
 * - `SelectorLink` selectors get `::attr(href)` appended, `SelectorImage` selectors `::attr(src)`.
 * - The resulting CSS text is split on `', '` into `collect`.
 * - `regex` is carried over, with a missing or empty value normalised to `null`.
 * - `remove` and `replace` start empty; callers fill `remove` from the `:rm` companion entry.
 *
 * The input object is not modified, so parsing the same configuration repeatedly gives the
 * same result every time.
 *
 * NOTE(review): the attribute suffix is appended to the whole selector text before splitting,
 * so for a multi-part selector ("a, b") only the last part carries it — confirm this is intended.
 *
 * @param selector Selector entry from the extension configuration.
 * @returns A freshly created `Selector`.
 */
const parseSelector = (selector: WebSelectorConfig) => {
  // Work on a local copy of the CSS text: writing the suffix back into `selector` would stack
  // another "::attr(...)" on every repeated call with the same config object.
  let css = selector.selector;
  if (selector.type === 'SelectorLink') {
    css += '::attr(href)';
  } else if (selector.type === 'SelectorImage') {
    css += '::attr(src)';
  }

  const parsedSelector: Selector = {
    collect: css.split(', '),
    remove: [],
    // `||` (not `??`) on purpose: an empty-string regex must also become null.
    regex: selector.regex || null,
    replace: [],
  };
  return parsedSelector;
};
/**
 * Groups selectors by their first parent selector (compared case-insensitively).
 *
 * @param selectors Flat selector list from the extension configuration.
 * @returns `[root, node, subpage]` — the entries whose first parent is `_root`, `node` and
 *   `subpage` respectively, each in input order. Entries with any other parent are in none
 *   of the groups.
 */
const splitSelectors = (selectors: WebSelectorConfig[]) => {
  const rootGroup: WebSelectorConfig[] = [];
  const nodeGroup: WebSelectorConfig[] = [];
  const subpageGroup: WebSelectorConfig[] = [];

  for (const entry of selectors) {
    switch (entry.parentSelectors[0].toLowerCase()) {
      case '_root':
        rootGroup.push(entry);
        break;
      case 'node':
        nodeGroup.push(entry);
        break;
      case 'subpage':
        subpageGroup.push(entry);
        break;
      default:
        // Any other parent belongs to no group.
        break;
    }
  }

  return [rootGroup, nodeGroup, subpageGroup];
};

/** Shape of the selector-id → output-field-name alias table. */
type Equivalents = {
  content: string;
  published: string;
};
// Selector ids that parseChildNodes renames to a different output field name.
const equivalents: Equivalents = {
  content: 'content_html',
  published: 'date_published',
};

/**
 * Parses one group of child selectors (all sharing the parent `node` or `subpage`) into
 * `parsedInput`.
 *
 * - Entries whose id ends with `:rm` are not fields themselves; the entry `<id>:rm` supplies
 *   the `remove` list of the entry `<id>`.
 * - The entry with id `subpage` becomes `parsedInput.subpageSelector`.
 * - Every other entry becomes a field selector, after renaming its id through `equivalents`.
 *   Fields that are not listed in `newsFields` (code ending with "news") or `eventFields`
 *   (code ending with "events") are skipped.
 * - The target list (`nodeSelectors` when the first entry's parent is `node`, otherwise
 *   `subpageSelectors`) is replaced, not appended to.
 *
 * @param list Selectors of a single parent group, including their `:rm` companions.
 * @param parsedInput Task configuration to fill; it is mutated in place.
 * @returns The same `parsedInput` object.
 */
const parseChildNodes = (list: WebSelectorConfig[], parsedInput: News | Events): News | Events => {
  const childNodes = list.filter((x) => !x.id.endsWith(':rm'));
  // An empty group (or one holding only ":rm" entries) has nothing to parse and no first
  // entry to read the parent from, so leave the configuration untouched.
  if (childNodes.length === 0) {
    return parsedInput;
  }

  const specificSelectors: 'nodeSelectors' | 'subpageSelectors' =
    childNodes[0].parentSelectors[0].toLowerCase() === 'node' ? 'nodeSelectors' : 'subpageSelectors';
  parsedInput[specificSelectors] = [];

  const isNews = parsedInput.code.endsWith('news');
  const isEvents = parsedInput.code.endsWith('events');

  for (const childNode of childNodes) {
    const isSubpageLink = childNode.id === 'subpage';

    let field = childNode.id;
    if (!isSubpageLink) {
      if (field in equivalents) field = equivalents[field as keyof Equivalents];
      // Keep only the fields known for this kind of task.
      if (
        (isNews && newsFields.indexOf(field) === -1) ||
        (isEvents && eventFields.indexOf(field) === -1)
      ) {
        continue;
      }
    }

    // Finish the selector (including its removals) before storing it, so no positional index
    // into the target list is needed.
    const parsed = parseSelector(childNode);
    const removal = list.find((x) => x.id === childNode.id + ':rm');
    parsed.remove = removal ? removal.selector.split(', ') : [];

    if (isSubpageLink) {
      parsedInput.subpageSelector = [parsed];
    } else {
      parsedInput[specificSelectors]?.push({
        field: field,
        selector: [parsed],
      });
    }
  }
  return parsedInput;
};

/**
 * Converts a scraper-extension configuration into a `News` or `Events` task configuration.
 *
 * - The root selector with id `node` supplies `nodeSelector` (its selector text, unsplit).
 * - The optional root selector `next` (with its optional `next:rm` companion) supplies
 *   `nextPageSelector`.
 * - Selectors whose parent is `node` or `subpage` are handed to `parseChildNodes`; a
 *   configuration may omit either group.
 * - Selectors with any other parent are ignored, because `splitSelectors` puts them in no
 *   group.
 *
 * @param extensionInput Configuration whose `_id` ends with "news" or "events".
 * @returns The parsed task configuration, with `maxPages` set to 2.
 * @throws Error when `_id` ends with neither "news" nor "events".
 * @throws Error when there is no root selector with id "node".
 */
const parseExtensionInput = (extensionInput: WebScraperConfig) => {
  const code = extensionInput._id;
  if (!code.endsWith('news') && !code.endsWith('events')) {
    throw new Error('Task code must end with "news" or "events"');
  }
  // News and events tasks start from the same skeleton.
  let parsedInput: News | Events = { code, maxPages: 2 };

  const [rootSelectors, nodeSelectors, subpageSelectors] = splitSelectors(extensionInput.selectors);

  const node = rootSelectors.find((x) => x.id === 'node');
  if (!node) {
    // Fail with a readable message instead of a TypeError from dereferencing a missing entry.
    throw new Error('Root selector with id "node" is required.');
  }
  parsedInput.nodeSelector = [node.selector];

  const nextPage = rootSelectors.find((x) => x.id === 'next');
  if (nextPage) {
    const parsedNext = parseSelector(nextPage);
    const removal = rootSelectors.find((x) => x.id === 'next:rm');
    parsedNext.remove = removal ? removal.selector.split(', ') : [];
    parsedInput.nextPageSelector = [parsedNext];
  }

  // Both child groups are optional; parse only the ones that are present.
  for (const group of [nodeSelectors, subpageSelectors]) {
    if (group.length > 0) {
      parsedInput = parseChildNodes(group, parsedInput);
    }
  }
  return parsedInput;
};

export default parseExtensionInput;
export { News, Events };
