hexchat/src/common/url.c

613 lines
14 KiB
C
Raw Normal View History

2011-02-24 06:14:30 +03:00
/* X-Chat
 * Copyright (C) 1998 Peter Zelezny.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
2012-10-24 23:33:02 +04:00
#include "hexchat.h"
#include "hexchatc.h"
2011-02-24 06:14:30 +03:00
#include "cfgfiles.h"
#include "fe.h"
#include "tree.h"
#include "url.h"
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
/* Store of grabbed URL strings; created on first use in url_add() and
   released by url_clear(). */
void *url_tree = NULL;
/* Index over the same strings, compared with strcasecmp(); url_find()
   consults it to detect duplicates. */
GTree *url_btree = NULL;
/* Tries each word pattern in turn and reports the match position and type. */
static int do_an_re (const char *word, int *start, int *end, int *type);
/* Accessors for the compiled patterns; each one is defined further down
   and caches its GRegex in a function-local static. */
static GRegex *re_url (void);
static GRegex *re_host (void);
static GRegex *re_email (void);
static GRegex *re_nick (void);
static GRegex *re_channel (void);
static GRegex *re_path (void);
2011-02-24 06:14:30 +03:00
/* tree_foreach() callback used by url_clear(): releases one stored URL
   string.  The second argument is not used.  Always returns TRUE. */
static int
url_free (char *url, void *data)
{
	(void) data;

	free (url);

	return TRUE;
}
/* Forget every grabbed URL: free the stored strings and tear down both
   the URL store and its lookup index.  Safe to call when nothing has
   been grabbed yet (both containers still NULL). */
void
url_clear (void)
{
	/* Drop the index first; its keys point at the strings freed below. */
	if (url_btree)
	{
		g_tree_destroy (url_btree);
		url_btree = NULL;
	}

	if (url_tree)
	{
		tree_foreach (url_tree, (tree_traverse_func *)url_free, NULL);
		tree_destroy (url_tree);
		url_tree = NULL;
	}
}
/* tree_foreach() callback used by url_save_tree(): writes one URL followed
   by a newline to the given stream.  Always returns TRUE. */
static int
url_save_cb (char *url, FILE *fd)
{
	fprintf (fd, "%s\n", url);

	return TRUE;
}
/* Write every stored URL, one per line, to the file fname opened with the
   given fopen-style mode.  When fullpath is set the name is passed to
   hexchat_fopen_file() with XOF_FULLPATH, otherwise with no flags.
   Does nothing if the file cannot be opened. */
void
url_save_tree (const char *fname, const char *mode, gboolean fullpath)
{
	int open_flags = fullpath ? XOF_FULLPATH : 0;
	FILE *out = hexchat_fopen_file (fname, mode, open_flags);

	if (out != NULL)
	{
		tree_foreach (url_tree, (tree_traverse_func *)url_save_cb, out);
		fclose (out);
	}
}
/* Append a single URL, followed by a newline, to the URL log.
   Silently does nothing if the log cannot be opened. */
static void
url_save_node (char* url)
{
	/* open <config>/url.log in append mode */
	FILE *log_fp = hexchat_fopen_file ("url.log", "a", 0);

	if (log_fp != NULL)
	{
		fprintf (log_fp, "%s\n", url);
		fclose (log_fp);
	}
}
static int
url_find (char *urltext)
{
return (g_tree_lookup_extended (url_btree, urltext, NULL, NULL));
2011-02-24 06:14:30 +03:00
}
static void
url_add (char *urltext, int len)
{
2012-03-16 02:58:52 +04:00
char *data;
int size;
/* we don't need any URLs if we have neither URL grabbing nor URL logging enabled */
2012-10-22 17:55:43 +04:00
if (!prefs.hex_url_grabber && !prefs.hex_url_logging)
{
2012-03-16 02:58:52 +04:00
return;
}
2012-03-16 02:58:52 +04:00
data = malloc (len + 1);
2011-02-24 06:14:30 +03:00
if (!data)
{
2011-02-24 06:14:30 +03:00
return;
}
2011-02-24 06:14:30 +03:00
memcpy (data, urltext, len);
data[len] = 0;
if (data[len - 1] == '.') /* chop trailing dot */
{
len--;
data[len] = 0;
}
/* chop trailing ) but only if there's no counterpart */
if (data[len - 1] == ')' && strchr (data, '(') == NULL)
{
2011-02-24 06:14:30 +03:00
data[len - 1] = 0;
}
2012-10-22 17:55:43 +04:00
if (prefs.hex_url_logging)
{
url_save_node (data);
}
/* the URL is saved already, only continue if we need the URL grabber too */
2012-10-22 17:55:43 +04:00
if (!prefs.hex_url_grabber)
{
free (data);
return;
}
2011-02-24 06:14:30 +03:00
if (!url_tree)
{
url_tree = tree_new ((tree_cmp_func *)strcasecmp, NULL);
url_btree = g_tree_new ((GCompareFunc)strcasecmp);
}
2011-02-24 06:14:30 +03:00
if (url_find (data))
{
free (data);
return;
}
2012-03-16 02:58:52 +04:00
size = tree_size (url_tree);
/* 0 is unlimited */
2012-10-22 17:55:43 +04:00
if (prefs.hex_url_grabber_limit > 0 && size >= prefs.hex_url_grabber_limit)
2012-03-16 02:58:52 +04:00
{
/* the loop is necessary to handle having the limit lowered while
2012-10-30 14:35:39 +04:00
HexChat is running */
2012-10-22 17:55:43 +04:00
size -= prefs.hex_url_grabber_limit;
2012-03-16 02:58:52 +04:00
for(; size > 0; size--)
{
char *pos;
pos = tree_remove_at_pos (url_tree, 0);
g_tree_remove (url_btree, pos);
free (pos);
}
2012-03-16 02:58:52 +04:00
}
tree_append (url_tree, data);
g_tree_insert (url_btree, data, GINT_TO_POINTER (tree_size (url_tree) - 1));
2011-02-24 06:14:30 +03:00
fe_url_add (data);
}
/* check if a word is clickable. This is called on mouse motion events, so
keep it FAST! This new version was found to be almost 3x faster than
2.4.4 release. */
/* Result of the most recent url_check_word() call: start offset, end
   offset and WORD_* type of the match (all reset to 0 at the start of
   each call).  Read back by callers through url_last(). */
static int laststart = 0;
static int lastend = 0;
static int lasttype = 0;
2013-01-11 13:39:21 +04:00
/* Test whether character c is a member of the NUL-terminated set s.
 *
 * Returns 1 when c occurs in s and 0 otherwise.  A NUL c is never
 * considered a member, even though every string ends in one.
 */
static int
strchrs (char c, const char *s)
{
	/* strchr() alone would match the terminator, so rule out NUL first */
	return c != '\0' && strchr (s, c) != NULL;
}
#define NICKPRE "~+!@%%&"
2011-02-24 06:14:30 +03:00
int
url_check_word (const char *word)
2011-02-24 06:14:30 +03:00
{
laststart = lastend = lasttype = 0;
if (do_an_re (word, &laststart, &lastend, &lasttype))
2011-02-24 06:14:30 +03:00
{
switch (lasttype)
2011-02-24 06:14:30 +03:00
{
2013-01-11 13:39:21 +04:00
char *str;
case WORD_NICK:
2013-01-11 13:39:21 +04:00
if (strchrs (word[laststart], NICKPRE))
laststart++;
str = g_strndup (&word[laststart], lastend - laststart);
2013-01-11 13:39:21 +04:00
if (!userlist_find (current_sess, str))
lasttype = 0;
g_free (str);
return lasttype;
case WORD_EMAIL:
if (!isalnum (word[laststart]))
laststart++;
/* Fall through */
case WORD_URL:
case WORD_HOST:
case WORD_CHANNEL:
case WORD_PATH:
return lasttype;
default:
return 0; /* Should not occur */
2011-02-24 06:14:30 +03:00
}
}
else
return 0;
2011-02-24 06:14:30 +03:00
}
/* List of IRC commands for which contents (and thus possible URLs)
* are visible to the user. NOTE: Trailing blank required in each. */
/* url_check_line() compares each entry as a prefix with strncmp(), so the
   trailing blank keeps e.g. "TOPIC " from matching a longer command name. */
static char *commands[] = {
"NOTICE ",
"PRIVMSG ",
"TOPIC ",
"332 ", /* RPL_TOPIC */
"372 " /* RPL_MOTD */
};
/* Number of elements in a real array; must not be applied to a pointer. */
#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
2011-02-24 06:14:30 +03:00
/* Scan one raw IRC line for URLs and hand each one to url_add().
 *
 * buf is read as a NUL-terminated string; len is accepted for interface
 * compatibility but not used.  Only lines whose command is listed in
 * commands[] are examined, and only the text after the target (channel or
 * nick) argument.  A match is recorded only if it contains "://".
 */
void
url_check_line (char *buf, int len)
{
	GRegex *url_re;
	GMatchInfo *gmi = NULL;
	char *po = buf;
	size_t i;

	(void) len;

	/* Skip over message prefix */
	if (*po == ':')
	{
		po = strchr (po, ' ');
		if (!po)
			return;
		po++;
	}

	/* Allow only commands from the above list */
	for (i = 0; i < ARRAY_SIZE (commands); i++)
	{
		const char *cmd = commands[i];
		size_t cmd_len = strlen (cmd);

		if (strncmp (cmd, po, cmd_len) == 0)
		{
			po += cmd_len;
			break;
		}
	}
	if (i == ARRAY_SIZE (commands))
		return;

	/* Skip past the channel name or user nick */
	po = strchr (po, ' ');
	if (!po)
		return;
	po++;

	/* Without a compiled pattern there is nothing to match against, and
	   g_regex_match() would leave gmi unset */
	url_re = re_url ();
	if (!url_re)
		return;

	g_regex_match (url_re, po, 0, &gmi);
	while (g_match_info_matches (gmi))
	{
		int start, end;

		g_match_info_fetch_pos (gmi, 0, &start, &end);
		/* do not let line terminators become part of the URL */
		while (end > start && (po[end - 1] == '\r' || po[end - 1] == '\n'))
			end--;
		if (g_strstr_len (po + start, end - start, "://"))
			url_add (po + start, end - start);
		g_match_info_next (gmi, NULL);
	}
	g_match_info_free (gmi);
}
/* Report the result of the most recent url_check_word() call: stores the
   match start and end offsets through lstart and lend and returns the
   remembered match type. */
int
url_last (int *lstart, int *lend)
{
	*lstart = laststart;
	*lend = lastend;

	return lasttype;
}
static int
do_an_re(const char *word,int *start, int *end, int *type)
{
typedef struct func_s {
GRegex *(*fn)(void);
int type;
} func_t;
func_t funcs[] =
{
{ re_url, WORD_URL },
{ re_email, WORD_EMAIL },
{ re_channel, WORD_CHANNEL },
{ re_host, WORD_HOST },
{ re_path, WORD_PATH },
{ re_nick, WORD_NICK }
};
2011-02-24 06:14:30 +03:00
GMatchInfo *gmi;
int k;
2011-02-24 06:14:30 +03:00
for (k = 0; k < sizeof funcs / sizeof (func_t); k++)
2011-02-24 06:14:30 +03:00
{
g_regex_match (funcs[k].fn(), word, 0, &gmi);
if (!g_match_info_matches (gmi))
2011-02-24 06:14:30 +03:00
{
g_match_info_free (gmi);
continue;
}
while (g_match_info_matches (gmi))
{
g_match_info_fetch_pos (gmi, 0, start, end);
g_match_info_next (gmi, NULL);
2011-02-24 06:14:30 +03:00
}
g_match_info_free (gmi);
*type = funcs[k].type;
return TRUE;
2011-02-24 06:14:30 +03:00
}
return FALSE;
}
/* Miscellaneous description --- */
#define DOMAIN "[a-z0-9][-a-z0-9]*(\\.[-a-z0-9]+)*\\."
#define TLD "[a-z][-a-z0-9]*[a-z]"
#define IPADDR "[0-9]{1,3}(\\.[0-9]{1,3}){3}"
#define IPV6ADDR "([0-9a-f]{0,4}(:[0-9a-f]{0,4})*:){2}[0-9a-f]{0,4}(:[0-9a-f]{0,4})*"
#define HOST "(" DOMAIN TLD "|" IPADDR "|" IPV6ADDR ")"
#define OPT_PORT "(:[1-9][0-9]{0,4})?"
GRegex *
make_re(char *grist, char *type)
{
GRegex *ret;
GError *err = NULL;
ret = g_regex_new (grist, G_REGEX_CASELESS | G_REGEX_OPTIMIZE, 0, &err);
g_free (grist);
return ret;
}
/* HOST description --- */
/* (see miscellaneous above) */
/* Return the compiled host pattern (HOST followed by OPT_PORT), building
   it through make_re() on the first call and reusing it afterwards. */
static GRegex *
re_host (void)
{
	static GRegex *host_ret;

	if (!host_ret)
	{
		char *pattern = g_strdup_printf (
			"("	/* HOST */
				HOST OPT_PORT
			")"
		);

		host_ret = make_re (pattern, "re_host");
	}

	return host_ret;
}
/* URL description --- */
#define SCHEME "(%s)"
#define LPAR "\\("
#define RPAR "\\)"
#define NOPARENS "[^() \t]*"
#define PATH \
"(" \
"(" LPAR NOPARENS RPAR ")" \
"|" \
"(" NOPARENS ")" \
")*" /* Zero or more occurrences of either of these */ \
"(?<![.,?!\\]])" /* Not allowed to end with these */
#define USERINFO "([-a-z0-9._~%]+(:[-a-z0-9._~%]*)?@)"
/* Flags used to describe URIs (RFC 3986)
*
 * Below is an example of what the flags match.
*
* URI_AUTHORITY - http://example.org:80/foo/bar
* ^^^^^^^^^^^^^^^^
* URI_USERINFO/URI_OPT_USERINFO - http://user@example.org:80/foo/bar
* ^^^^^
* URI_PATH - http://example.org:80/foo/bar
* ^^^^^^^^
*/
#define URI_AUTHORITY (1 << 0)
#define URI_OPT_USERINFO (1 << 1)
#define URI_USERINFO (1 << 2)
#define URI_PATH (1 << 3)
/* Table of recognised URI schemes, consumed by re_url() to build the URL
   pattern.  The list ends at the entry whose scheme is NULL. */
struct
{
	const char *scheme;	/* scheme name. e.g. http */
	const char *path_sep;	/* string that begins the path */
	int flags;		/* see above (flag macros) */
} uri[] = {
	/* path only, no authority part */
	{ "irc",       "/", URI_PATH },
	{ "ircs",      "/", URI_PATH },

	/* authority and path, no user info */
	{ "rtsp",      "/", URI_AUTHORITY | URI_PATH },
	{ "feed",      "/", URI_AUTHORITY | URI_PATH },
	{ "teamspeak", "?", URI_AUTHORITY | URI_PATH },

	/* authority with optional user info, and path */
	{ "ftp",       "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "sftp",      "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "ftps",      "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "http",      "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "https",     "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "cvs",       "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "svn",       "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "git",       "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "rsync",     "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "mumble",    "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "ventrilo",  "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "xmpp",      "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "h323",      ";", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "imap",      "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "pop",       "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "nfs",       "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
	{ "smb",       "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },

	/* authority only, no path */
	{ "ssh",       "",  URI_AUTHORITY | URI_OPT_USERINFO },
	{ "sip",       "",  URI_AUTHORITY | URI_USERINFO },
	{ "sips",      "",  URI_AUTHORITY | URI_USERINFO },

	/* path only */
	{ "magnet",    "?", URI_PATH },
	{ "mailto",    "",  URI_PATH },
	{ "bitcoin",   "",  URI_PATH },
	{ "gtalk",     "",  URI_PATH },
	{ "steam",     "",  URI_PATH },
	{ "file",      "/", URI_PATH },

	/* end-of-table marker */
	{ NULL,        "",  0}
};
/* Return the compiled URL pattern, building it on the first call.
 *
 * The pattern is an alternation of a "schemeless" host/path form and one
 * branch per entry of the uri[] table; which pieces (//, user info, host,
 * path) go into a branch is decided by that entry's flags.
 */
static GRegex *
re_url (void)
{
	static GRegex *url_ret = NULL;
	GString *pattern;
	int idx;

	if (url_ret != NULL)
		return url_ret;

	pattern = g_string_new (NULL);

	/* Add regex "host/path", representing a "schemeless" url */
	g_string_append (pattern, "(" HOST OPT_PORT "/" "(" PATH ")?" ")");

	for (idx = 0; uri[idx].scheme != NULL; idx++)
	{
		const int flags = uri[idx].flags;
		const int has_authority = (flags & URI_AUTHORITY) != 0;

		g_string_append (pattern, "|(");
		g_string_append_printf (pattern, "%s:", uri[idx].scheme);

		if (has_authority)
			g_string_append (pattern, "//");

		/* mandatory user info takes precedence over optional */
		if (flags & URI_USERINFO)
			g_string_append (pattern, USERINFO);
		else if (flags & URI_OPT_USERINFO)
			g_string_append (pattern, USERINFO "?");

		if (has_authority)
			g_string_append (pattern, HOST OPT_PORT);

		if (flags & URI_PATH)
		{
			/* the separator is literal text, so escape it for the regex */
			char *sep = g_regex_escape_string (uri[idx].path_sep,
														  strlen(uri[idx].path_sep));

			g_string_append_printf(pattern, "(" "%s" PATH ")?",
										  sep);
			g_free(sep);
		}

		g_string_append(pattern, ")");
	}

	/* keep the character data, discard the GString wrapper */
	url_ret = make_re (g_string_free (pattern, FALSE), "re_url");

	return url_ret;
}
/* EMAIL description --- */
#define EMAIL "[a-z][-_a-z0-9]+@" "(" HOST ")"
/* Return the compiled e-mail address pattern (EMAIL), building it through
   make_re() on the first call and reusing it afterwards. */
static GRegex *
re_email (void)
{
	static GRegex *email_ret;

	if (email_ret == NULL)
	{
		char *pattern;

		pattern = g_strdup_printf (
			"("	/* EMAIL */
				EMAIL
			")"
		);
		email_ret = make_re (pattern, "re_email");
	}

	return email_ret;
}
/* NICK description --- */
2013-01-11 13:39:21 +04:00
/* For NICKPRE see before url_check_word() */
#define NICKHYP "-"
#define NICKLET "a-z"
#define NICKDIG "0-9"
/* Note for NICKSPE: \\\\ boils down to a single \ */
#define NICKSPE "\\[\\]\\\\`_^{|}"
#if 0
2013-01-04 02:13:20 +04:00
#define NICK0 "[" NICKPRE "]?[" NICKLET NICKSPE "]"
#else
/* Allow violation of rfc 2812 by allowing digit as first char */
/* Rationale is that url_check_word() will anyway look up what */
/* we find, and that WORD_NICK is the last item in the array */
/* that do_an_re() runs through. */
#define NICK0 "[" NICKPRE "]?[" NICKLET NICKDIG NICKSPE "]"
#endif
#define NICK1 "[" NICKHYP NICKLET NICKDIG NICKSPE "]*"
#define NICK NICK0 NICK1
/* Return the compiled nickname pattern (NICK), building it through
   make_re() on the first call and reusing it afterwards.  The pattern
   text goes through a printf-style formatter, which is why NICKPRE
   spells its percent sign as "%%". */
static GRegex *
re_nick (void)
{
	static GRegex *nick_ret;
	char *pattern;

	if (nick_ret != NULL)
		return nick_ret;

	pattern = g_strdup_printf (
		"("	/* NICK */
			NICK
		")"
	);
	nick_ret = make_re (pattern, "re_nick");

	return nick_ret;
}
/* CHANNEL description --- */
#define CHANNEL "#[^ \t\a,:]+"
/* Return the compiled channel-name pattern (CHANNEL), building it through
   make_re() on the first call and reusing it afterwards. */
static GRegex *
re_channel (void)
{
	static GRegex *channel_ret;

	if (!channel_ret)
	{
		char *pattern = g_strdup_printf (
			"("	/* CHANNEL */
				CHANNEL
			")"
		);

		channel_ret = make_re (pattern, "re_channel");
	}

	return channel_ret;
}
/* PATH description --- */
#ifdef WIN32
2013-03-11 20:55:29 +04:00
/* Windows path can be .\ ..\ or C: D: etc */
#define FS_PATH "^(\\.{1,2}\\\\|[a-z]:).*"
#else
/* Linux path can be / or ./ or ../ etc */
#define FS_PATH "^(/|\\./|\\.\\./).*"
#endif
/* Return the compiled filesystem-path pattern (FS_PATH, which differs
   between WIN32 and other builds), building it through make_re() on the
   first call and reusing it afterwards. */
static GRegex *
re_path (void)
{
	static GRegex *path_ret;
	char *pattern;

	if (path_ret != NULL)
		return path_ret;

	pattern = g_strdup_printf (
		"("	/* FS_PATH */
			FS_PATH
		")"
	);
	path_ret = make_re (pattern, "re_path");

	return path_ret;
}