sublime-wakatime/packages/wakatime/stats.py

389 lines
11 KiB
Python
Raw Normal View History

2013-09-23 00:51:23 +04:00
# -*- coding: utf-8 -*-
"""
wakatime.stats
~~~~~~~~~~~~~~
Stats about files
:copyright: (c) 2013 Alan Hamlett.
:license: BSD, see LICENSE for more details.
"""
import logging
import os
2017-03-02 09:30:57 +03:00
import re
2013-09-23 00:51:23 +04:00
import sys
2014-09-30 20:27:35 +04:00
from .compat import u, open
2017-05-25 09:53:28 +03:00
from .constants import MAX_FILE_SIZE_SUPPORTED
2015-09-29 13:11:25 +03:00
from .dependencies import DependencyParser
2018-03-15 11:31:17 +03:00
from .exceptions import SkipHeartbeat
2017-03-16 18:26:24 +03:00
from .language_priorities import LANGUAGES
2014-09-30 20:27:35 +04:00
2017-02-21 03:18:38 +03:00
from .packages.pygments.lexers import (
2017-03-02 09:30:57 +03:00
_iter_lexerclasses,
_fn_matches,
basename,
2017-02-21 03:18:38 +03:00
ClassNotFound,
2019-03-31 04:53:40 +03:00
CppLexer,
2017-02-21 03:18:38 +03:00
find_lexer_class,
2016-04-19 01:26:28 +03:00
get_lexer_by_name,
)
2017-02-21 03:18:38 +03:00
from .packages.pygments.modeline import get_filetype_from_buffer
2013-09-23 00:51:23 +04:00
2016-04-29 01:04:46 +03:00
try:
from .packages import simplejson as json # pragma: nocover
except (ImportError, SyntaxError): # pragma: nocover
import json
2013-09-23 00:51:23 +04:00
log = logging.getLogger('WakaTime')
2013-09-23 00:51:23 +04:00
2017-02-21 03:18:38 +03:00
def get_file_stats(file_name, entity_type='file', lineno=None, cursorpos=None,
                   plugin=None, language=None, local_file=None):
    """Build the stats dictionary describing an entity.

    The given language is first standardized for the plugin. For entities
    of type 'file' the language is guessed when it is still unknown, and
    the dependencies and line count are read from ``local_file`` when
    provided, otherwise from ``file_name``.

    Returns a dict with the keys 'language', 'dependencies', 'lines',
    'lineno' and 'cursorpos'.
    """
    language = standardize_language(language, plugin)
    stats = dict(
        language=language,
        dependencies=[],
        lines=None,
        lineno=lineno,
        cursorpos=cursorpos,
    )

    # Only files have contents to inspect; other entity types are done here.
    if entity_type != 'file':
        return stats

    lexer = get_lexer(language)
    if not language:
        language, lexer = guess_language(file_name, local_file)

    parser = DependencyParser(local_file or file_name, lexer)
    stats['language'] = use_root_language(language, lexer)
    stats['dependencies'] = parser.parse()
    stats['lines'] = number_lines_in_file(local_file or file_name)
    return stats
2018-09-21 08:29:34 +03:00
def guess_language(file_name, local_file):
    """Work out the language and Pygments lexer for a file.

    The file extension is tried first; when it gives no answer the lexer
    is guessed from the file name and contents instead.

    Returns a tuple of (language_str, lexer_obj).
    """
    language = get_language_from_extension(file_name)
    if language:
        return language, get_lexer(language)

    # The extension was inconclusive, so fall back to content analysis.
    lexer = smart_guess_lexer(file_name, local_file)
    if lexer:
        language = u(lexer.name)
    return language, lexer
2018-09-21 08:29:34 +03:00
def smart_guess_lexer(file_name, local_file):
    """Guess Pygments lexer for a file.

    Makes two guesses from the head of the file: one restricted to the
    lexers matching the file name, ranked by text analysis, and one taken
    from a Vim modeline in the contents. The modeline lexer wins only when
    it has a non-zero accuracy and the file name guess has no accuracy or a
    lower one.

    The contents are read from ``local_file`` when it is provided, otherwise
    from ``file_name``, so a remote entity is analysed using its local copy.

    Returns a lexer object, or None when neither guess produced one.
    """
    # Read the same path that is used for the file name guess; reading
    # file_name here would yield no text when only the local copy exists.
    source = local_file or file_name
    text = get_file_head(source)

    lexer1, accuracy1 = guess_lexer_using_filename(source, text)
    lexer2, accuracy2 = guess_lexer_using_modeline(text)

    lexer = lexer1 if lexer1 else None
    if (lexer2 and accuracy2 and
            (not accuracy1 or accuracy2 > accuracy1)):
        lexer = lexer2

    return lexer
def guess_lexer_using_filename(file_name, text):
    """Pick a lexer among those matching the file name, scored on the text.

    A SkipHeartbeat raised while guessing is re-raised as a new
    SkipHeartbeat; any other failure is logged at DEBUG level and treated
    as "no lexer".

    Returns a tuple of (lexer, accuracy); either may be None.
    """
    try:
        candidate = custom_pygments_guess_lexer_for_filename(file_name, text)
    except SkipHeartbeat as ex:
        raise SkipHeartbeat(u(ex))
    except:
        log.traceback(logging.DEBUG)
        candidate = None

    if candidate is None:
        return None, None

    score = None
    try:
        score = candidate.analyse_text(text)
    except:
        log.traceback(logging.DEBUG)
    return candidate, score
def guess_lexer_using_modeline(text):
    """Pick a lexer from the Vim modeline found in the given text.

    Failures at any step are logged at DEBUG level and leave the
    corresponding value as None.

    Returns a tuple of (lexer, accuracy); either may be None.
    """
    file_type = None
    try:
        file_type = get_filetype_from_buffer(text)
    except:  # pragma: nocover
        log.traceback(logging.DEBUG)
    if file_type is None:
        return None, None

    lexer = None
    try:
        lexer = get_lexer_by_name(file_type)
    except ClassNotFound:
        log.traceback(logging.DEBUG)
    if lexer is None:
        return None, None

    accuracy = None
    try:
        accuracy = lexer.analyse_text(text)
    except:  # pragma: nocover
        log.traceback(logging.DEBUG)
    return lexer, accuracy
2015-08-25 10:42:37 +03:00
def get_language_from_extension(file_name):
    """Return a language inferred from the file extension and its siblings.

    Handles 'go.mod' plus the C family: for extensions starting with '.h'
    or '.c' the language is chosen by looking for a file with the same
    base name and a '.c', '.m' or '.mm' extension, then by the extensions
    present in the same folder. '.m' and '.mm' files are reported as
    Objective-C / Objective-C++ only when a matching header exists.

    Returns the language name, or None when no rule applies.
    """
    filepart, extension = os.path.splitext(file_name)
    filename = os.path.split(file_name)[1]

    def sibling_exists(*suffixes):
        # True when a file with the same base name and one of the given
        # extensions exists (checked in the order given).
        return any(
            os.path.exists(u('{0}{1}').format(u(filepart), u(suffix)))
            for suffix in suffixes
        )

    if filename == 'go.mod':
        return 'Go'

    # These patterns match any extension that starts with ".h" or ".c".
    starts_with_h = re.match(r'\.h.*$', extension, re.IGNORECASE)
    if starts_with_h or re.match(r'\.c.*$', extension, re.IGNORECASE):
        if sibling_exists('.c', '.C'):
            return 'C'
        if sibling_exists('.m', '.M'):
            return 'Objective-C'
        if sibling_exists('.mm', '.MM'):
            return 'Objective-C++'

        available_extensions = extensions_in_same_folder(file_name)
        if any(pattern.lstrip('*') in available_extensions
               for pattern in CppLexer.filenames):
            return 'C++'
        if '.c' in available_extensions:
            return 'C'

    if re.match(r'\.m$', extension, re.IGNORECASE) and sibling_exists('.h', '.H'):
        return 'Objective-C'
    if re.match(r'\.mm$', extension, re.IGNORECASE) and sibling_exists('.h', '.H'):
        return 'Objective-C++'

    return None
2013-09-23 00:51:23 +04:00
def number_lines_in_file(file_name):
    """Count the lines in a text file.

    The file is decoded as UTF-8 first and, if that fails, with the
    filesystem encoding.

    Returns the number of lines, or None when the file is larger than
    MAX_FILE_SIZE_SUPPORTED or cannot be read with either encoding.
    """
    try:
        if os.path.getsize(file_name) > MAX_FILE_SIZE_SUPPORTED:
            return None
    except OSError:
        # Size could not be determined; let the read attempts below decide.
        pass

    for encoding in ('utf-8', sys.getfilesystemencoding()):
        try:
            with open(file_name, 'r', encoding=encoding) as fh:
                # Count from zero on every attempt so lines read before a
                # mid-file decode error are not added to the fallback total.
                return sum(1 for _ in fh)
        except Exception:
            continue
    return None
2016-04-29 01:04:46 +03:00
def standardize_language(language, plugin):
    """Translate a language string into its standardized name.

    The plugin-specific mapping is consulted first, keyed by the plugin
    name taken from the last space-separated token of ``plugin`` with
    anything after the first '/' or '-' removed. The default mapping is
    used when the plugin mapping has no answer.

    Returns the standardized language string, or None.
    """
    if not language:
        return None

    if plugin:
        # Reduce the plugin string to the bare plugin name.
        last_token = plugin.rpartition(' ')[2]
        plugin = last_token.partition('/')[0].partition('-')[0]
        match = get_language_from_json(language, plugin)
        if match is not None:
            return match

    # Nothing plugin-specific matched; use the default mapping.
    return get_language_from_json(language, 'default')
2017-03-02 09:30:57 +03:00
def get_lexer(language):
    """Instantiate the Pygments lexer for a language string.

    Returns None when no language is given or no lexer class is found.
    """
    lexer_cls = find_lexer_class(language) if language else None
    return lexer_cls() if lexer_cls else None
2016-04-29 01:04:46 +03:00
2018-03-15 11:31:17 +03:00
def use_root_language(language, lexer):
    """Return the language name to report for a language and its lexer.

    A small override table is applied first. Otherwise, when the lexer has
    a ``root_lexer`` attribute, the name of that root lexer is returned.
    In every other case the language is returned unchanged.
    """
    overrides = {
        'Coldfusion HTML': 'ColdFusion',
    }
    if language in overrides:
        return overrides[language]

    if not lexer or not hasattr(lexer, 'root_lexer'):
        return language
    return u(lexer.root_lexer.name)
2016-04-29 01:04:46 +03:00
def get_language_from_json(language, key):
    """Look up a language in the JSON mapping file for the given key.

    The mapping is read from ``languages/<key>.json`` next to this module,
    with the key lower-cased, and the language is looked up lower-cased.

    Returns the mapped value, or None when the file does not exist, the
    language is missing or maps to an empty value, or reading fails (the
    failure is logged at DEBUG level).
    """
    # Format only the file name: formatting the whole joined path would
    # break when the install directory itself contains '{' or '}'.
    file_name = os.path.join(
        os.path.dirname(__file__),
        'languages',
        '{0}.json'.format(key.lower()),
    )
    if not os.path.exists(file_name):
        return None

    try:
        with open(file_name, 'r', encoding='utf-8') as fh:
            languages = json.loads(fh.read())
        return languages.get(language.lower()) or None
    except Exception:
        log.traceback(logging.DEBUG)
    return None
def get_file_head(file_name):
    """Return up to the first 512000 characters of the file's contents.

    The file is read as UTF-8 text, falling back to the filesystem
    encoding. Returns None when both attempts fail; the second failure is
    logged at DEBUG level.
    """
    def read_head(encoding):
        with open(file_name, 'r', encoding=encoding) as handle:
            return handle.read(512000)

    try:
        return read_head('utf-8')
    except:
        try:
            return read_head(sys.getfilesystemencoding())  # pragma: nocover
        except:
            log.traceback(logging.DEBUG)
    return None
2017-03-02 09:30:57 +03:00
def custom_pygments_guess_lexer_for_filename(_fn, _text, **options):
    """Guess a lexer from the file name and text using custom priorities.

    Replacement for pygments.lexers.guess_lexer_for_filename that ranks
    the candidate lexers with customize_lexer_priority.

    Raises ClassNotFound when no lexer matches the file name, and
    SkipHeartbeat when both a Matlab and an Objective-C lexer match with
    equal accuracy.
    """
    fn = basename(_fn)
    primary = {}
    matching_lexers = set()
    for lexer_cls in _iter_lexerclasses():
        # Alias patterns are checked after the primary ones, so a lexer
        # matching both ends up flagged as non-primary.
        pattern_groups = (
            (lexer_cls.filenames, True),
            (lexer_cls.alias_filenames, False),
        )
        for patterns, is_primary in pattern_groups:
            for pattern in patterns:
                if _fn_matches(fn, pattern):
                    matching_lexers.add(lexer_cls)
                    primary[lexer_cls] = is_primary

    if not matching_lexers:
        raise ClassNotFound('no lexer for filename %r found' % fn)
    if len(matching_lexers) == 1:
        only_match = matching_lexers.pop()
        return only_match(**options)

    result = []
    for lexer_cls in matching_lexers:
        score = lexer_cls.analyse_text(_text)
        if score == 1.0:
            # A perfect score wins immediately.
            return lexer_cls(**options)
        result.append(customize_lexer_priority(_fn, score, lexer_cls))

    matlab = [entry for entry in result
              if entry[2].name.lower() == 'matlab']
    if matlab:
        objc = [entry for entry in result
                if entry[2].name.lower() == 'objective-c']
        if objc and objc[0][0] == matlab[0][0]:
            raise SkipHeartbeat('Skipping because not enough language accuracy.')

    def rank(entry):
        # Ranking order: analyse score, primary filename pattern,
        # priority, and finally the class name as a last resort.
        accuracy, priority, cls = entry
        return (accuracy, primary[cls], priority, cls.__name__)

    result.sort(key=rank)
    best = result[-1][2]
    return best(**options)
2017-03-02 09:30:57 +03:00
2017-06-08 10:20:10 +03:00
def customize_lexer_priority(file_name, accuracy, lexer):
    """Adjust a lexer's priority and accuracy for ranking.

    Lexers listed in LANGUAGES take their priority from that table. For
    Matlab and Objective-C lexers not in the table, the accuracy is nudged
    by 0.01 steps depending on whether '.mat' and '.h' files exist in the
    same folder as ``file_name``.

    Returns a tuple of (accuracy, priority, lexer).
    """
    priority = lexer.priority
    lexer_name = lexer.name.lower().replace('sharp', '#')

    if lexer_name in LANGUAGES:
        return (accuracy, LANGUAGES[lexer_name], lexer)
    if lexer_name not in ('matlab', 'objective-c'):
        return (accuracy, priority, lexer)

    siblings = extensions_in_same_folder(file_name)
    has_mat = '.mat' in siblings
    has_header = '.h' in siblings

    # Adjustments are applied one step at a time so the floating point
    # result matches a sequence of individual 0.01 additions.
    if lexer_name == 'matlab':
        if has_mat:
            accuracy += 0.01
        if not has_header:
            accuracy += 0.01
    else:
        if has_mat:
            accuracy -= 0.01
        else:
            accuracy += 0.01
        if has_header:
            accuracy += 0.01

    return (accuracy, priority, lexer)
def extensions_in_same_folder(file_name):
    """Return the set of file extensions found in file_name's folder.

    Extensions are lower-cased and include the leading dot; entries
    without an extension contribute an empty string. A bare file name is
    looked up in the current directory. An empty set is returned when the
    folder is empty or cannot be listed.
    """
    # os.path.dirname() gives '' for a bare file name, which os.listdir()
    # rejects, so fall back to the current directory.
    directory = os.path.dirname(file_name) or os.curdir
    try:
        files = os.listdir(directory)
    except OSError:
        # Missing or unreadable folder: no sibling extensions are known.
        return set()
    return set(os.path.splitext(name)[1].lower() for name in files)